- 
                Notifications
    
You must be signed in to change notification settings  - Fork 15.1k
 
[LoopPeel] Remove known trip count restriction when peeling last. #140792
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
| 
          
 @llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) ChangesRemove the restriction that the loop must be known to execute at least 2 iterations when peeling the last iteration. If we cannot prove at least 2 iterations are executed, a check and branch to skip the peeled loop is inserted. Full diff: https://github.com/llvm/llvm-project/pull/140792.diff 4 Files Affected: 
 diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
index 4eaa3c9714370..4422f7cd9480b 100644
--- a/llvm/lib/Transforms/Utils/LoopPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -37,6 +37,7 @@
 #include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
 #include "llvm/Transforms/Utils/ValueMapper.h"
 #include <algorithm>
 #include <cassert>
@@ -332,11 +333,7 @@ bool llvm::canPeelLastIteration(const Loop &L, ScalarEvolution &SE) {
   CmpPredicate Pred;
   BasicBlock *Succ1;
   BasicBlock *Succ2;
-  // The loop must execute at least 2 iterations to guarantee that peeled
-  // iteration executes.
-  // TODO: Add checks during codegen.
-  if (isa<SCEVCouldNotCompute>(BTC) ||
-      !SE.isKnownPredicate(CmpInst::ICMP_UGT, BTC, SE.getZero(BTC->getType())))
+  if (isa<SCEVCouldNotCompute>(BTC))
     return false;
 
   // Check if the exit condition of the loop can be adjusted by the peeling
@@ -818,7 +815,7 @@ static void initBranchWeights(DenseMap<Instruction *, WeightInfo> &WeightInfos,
 /// instructions in the last peeled-off iteration.
 static void cloneLoopBlocks(
     Loop *L, unsigned IterNumber, bool PeelLast, BasicBlock *InsertTop,
-    BasicBlock *InsertBot,
+    BasicBlock *InsertBot, BasicBlock *OrigPreHeader,
     SmallVectorImpl<std::pair<BasicBlock *, BasicBlock *>> &ExitEdges,
     SmallVectorImpl<BasicBlock *> &NewBlocks, LoopBlocksDFS &LoopBlocks,
     ValueToValueMapTy &VMap, ValueToValueMapTy &LVMap, DominatorTree *DT,
@@ -912,10 +909,19 @@ static void cloneLoopBlocks(
   if (PeelLast) {
     // For the last iteration, we use the value from the latch of the original
     // loop directly.
+    //
+    IRBuilder<> B(InsertTop->getTerminator());
     for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
       PHINode *NewPHI = cast<PHINode>(VMap[&*I]);
-      VMap[&*I] = NewPHI->getIncomingValueForBlock(Latch);
+      PHINode *PN = B.CreatePHI(NewPHI->getType(), 2);
       NewPHI->eraseFromParent();
+      if (OrigPreHeader)
+        PN->addIncoming(cast<PHINode>(&*I)->getIncomingValueForBlock(PreHeader),
+                        OrigPreHeader);
+
+      PN->addIncoming(cast<PHINode>(&*I)->getIncomingValueForBlock(Latch),
+                      Latch);
+      VMap[&*I] = PN;
     }
   } else {
     // For the first iteration, we use the value from the preheader directly.
@@ -1049,7 +1055,7 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI,
   // Set up all the necessary basic blocks.
   BasicBlock *InsertTop;
   BasicBlock *InsertBot;
-  BasicBlock *NewPreHeader;
+  BasicBlock *NewPreHeader = nullptr;
   DenseMap<Instruction *, Value *> ExitValues;
   if (PeelLast) {
     // It is convenient to split the single exit block from the latch the
@@ -1080,11 +1086,40 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI,
     for (PHINode &P : Exit->phis())
       ExitValues[&P] = P.getIncomingValueForBlock(Latch);
 
+    const SCEV *BTC = SE->getBackedgeTakenCount(L);
+
     InsertTop = SplitEdge(Latch, Exit, &DT, LI);
     InsertBot = SplitBlock(InsertTop, InsertTop->getTerminator(), &DT, LI);
 
     InsertTop->setName(Exit->getName() + ".peel.begin");
     InsertBot->setName(Exit->getName() + ".peel.next");
+    NewPreHeader = nullptr;
+
+    // If the original loop may only execute a single iteration we need to
+    // insert a trip count check and skip the peeled loop if necessary.
+    if (!SE->isKnownPredicate(CmpInst::ICMP_UGT, BTC,
+                              SE->getZero(BTC->getType()))) {
+      NewPreHeader = SplitEdge(PreHeader, Header, &DT, LI);
+      SCEVExpander Expander(*SE, Latch->getDataLayout(), "loop-peel");
+
+      BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
+      Value *BTCValue =
+          Expander.expandCodeFor(BTC, BTC->getType(), PreHeaderBR);
+      IRBuilder<> B(PreHeaderBR);
+      Value *Cond =
+          B.CreateICmpNE(BTCValue, ConstantInt::get(BTCValue->getType(), 0));
+      B.CreateCondBr(Cond, NewPreHeader, InsertTop);
+      PreHeaderBR->eraseFromParent();
+
+      // PreHeader now dominates InsertTop.
+      DT.changeImmediateDominator(InsertTop, PreHeader);
+
+      // If we branch from PreHeader to InsertTop, we are guaranteed to execute
+      // the peeled iteration, so the exit values from the original loop are
+      // dead. Use poison for them.
+      for (auto &PN : InsertTop->phis())
+        PN.addIncoming(PoisonValue::get(PN.getType()), PreHeader);
+    }
   } else {
     // It is convenient to split the preheader into 3 parts - two blocks to
     // anchor the peeled copy of the loop body, and a new preheader for the
@@ -1158,8 +1193,9 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, bool PeelLast, LoopInfo *LI,
   for (unsigned Iter = 0; Iter < PeelCount; ++Iter) {
     SmallVector<BasicBlock *, 8> NewBlocks;
 
-    cloneLoopBlocks(L, Iter, PeelLast, InsertTop, InsertBot, ExitEdges,
-                    NewBlocks, LoopBlocks, VMap, LVMap, &DT, LI,
+    cloneLoopBlocks(L, Iter, PeelLast, InsertTop, InsertBot,
+                    NewPreHeader ? PreHeader : nullptr, ExitEdges, NewBlocks,
+                    LoopBlocks, VMap, LVMap, &DT, LI,
                     LoopLocalNoAliasDeclScopes, *SE);
 
     // Remap to use values from the current iteration instead of the
diff --git a/llvm/test/Transforms/LoopUnroll/peel-last-iteration-with-constant-trip-count.ll b/llvm/test/Transforms/LoopUnroll/peel-last-iteration-with-constant-trip-count.ll
index f1290069bda0c..2e724e748aa67 100644
--- a/llvm/test/Transforms/LoopUnroll/peel-last-iteration-with-constant-trip-count.ll
+++ b/llvm/test/Transforms/LoopUnroll/peel-last-iteration-with-constant-trip-count.ll
@@ -12,8 +12,8 @@ define i64 @peel_single_block_loop_iv_step_1() {
 ; CHECK-NEXT:    [[EC1:%.*]] = icmp ne i64 [[IV_NEXT1]], 63
 ; CHECK-NEXT:    br i1 [[EC1]], label %[[LOOP]], label %[[EXIT_PEEL_BEGIN:.*]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[EXIT_PEEL_BEGIN]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT1]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT1]], %[[LOOP]] ]
 ; CHECK-NEXT:    br label %[[LOOP_PEEL:.*]]
 ; CHECK:       [[LOOP_PEEL]]:
 ; CHECK-NEXT:    [[CMP18_NOT:%.*]] = icmp eq i64 [[IV]], 63
@@ -91,8 +91,8 @@ define i64 @peel_single_block_loop_iv_step_1_eq_pred() {
 ; CHECK-NEXT:    [[CMP_PEEL:%.*]] = icmp eq i64 [[IV_LCSSA]], 63
 ; CHECK-NEXT:    br i1 [[CMP_PEEL]], label %[[EXIT_PEEL_BEGIN:.*]], label %[[LOOP]], !llvm.loop [[LOOP2:![0-9]+]]
 ; CHECK:       [[EXIT_PEEL_BEGIN]]:
-; CHECK-NEXT:    [[IV_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_LCSSA]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[IV_LCSSA1:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_LCSSA]], %[[LOOP]] ]
 ; CHECK-NEXT:    br label %[[LOOP_PEEL:.*]]
 ; CHECK:       [[LOOP_PEEL]]:
 ; CHECK-NEXT:    [[CMP_PEEL1:%.*]] = icmp eq i64 [[IV_NEXT_LCSSA]], 63
@@ -170,8 +170,8 @@ define i64 @peel_single_block_loop_iv_step_1_nested_loop() {
 ; CHECK-NEXT:    [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 63
 ; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[OUTER_LATCH_PEEL_BEGIN:.*]], !llvm.loop [[LOOP3:![0-9]+]]
 ; CHECK:       [[OUTER_LATCH_PEEL_BEGIN]]:
-; CHECK-NEXT:    [[IV_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_NEXT]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_NEXT]], %[[LOOP]] ]
 ; CHECK-NEXT:    br label %[[LOOP_PEEL:.*]]
 ; CHECK:       [[LOOP_PEEL]]:
 ; CHECK-NEXT:    [[CMP_PEEL:%.*]] = icmp eq i64 [[IV_NEXT_LCSSA]], 63
@@ -236,8 +236,8 @@ define i64 @peel_multi_block_loop_iv_step_1() {
 ; CHECK-NEXT:    [[EC:%.*]] = icmp ne i64 [[IV_NEXT]], 63
 ; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT_PEEL_BEGIN:.*]], !llvm.loop [[LOOP4:![0-9]+]]
 ; CHECK:       [[EXIT_PEEL_BEGIN]]:
-; CHECK-NEXT:    [[IV_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_NEXT]], %[[LATCH]] ]
 ; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV]], %[[LATCH]] ]
+; CHECK-NEXT:    [[IV_NEXT_LCSSA:%.*]] = phi i64 [ [[IV_NEXT]], %[[LATCH]] ]
 ; CHECK-NEXT:    br label %[[LOOP_PEEL:.*]]
 ; CHECK:       [[LOOP_PEEL]]:
 ; CHECK-NEXT:    [[CMP_PEEL:%.*]] = icmp eq i64 [[IV_NEXT_LCSSA]], 63
@@ -364,8 +364,8 @@ define i64 @peel_single_block_loop_iv_step_1_btc_1() {
 ; CHECK-NEXT:    [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1
 ; CHECK-NEXT:    br i1 false, label %[[LOOP]], label %[[EXIT_PEEL_BEGIN:.*]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK:       [[EXIT_PEEL_BEGIN]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT1]], %[[LOOP]] ]
 ; CHECK-NEXT:    [[IV_LCSSA:%.*]] = phi i64 [ [[IV1]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[IV_NEXT1]], %[[LOOP]] ]
 ; CHECK-NEXT:    br label %[[LOOP_PEEL:.*]]
 ; CHECK:       [[LOOP_PEEL]]:
 ; CHECK-NEXT:    [[CMP:%.*]] = icmp eq i64 [[IV]], 1
@@ -483,9 +483,9 @@ define i32 @peel_loop_with_branch_and_phi_uses(ptr %x, i1 %c) {
 ; CHECK-NEXT:    [[EC1:%.*]] = icmp ne i32 [[IV_NEXT1]], 99
 ; CHECK-NEXT:    br i1 [[EC1]], label %[[LOOP_HEADER]], label %[[LOOPEXIT_PEEL_BEGIN:.*]], !llvm.loop [[LOOP6:![0-9]+]]
 ; CHECK:       [[LOOPEXIT_PEEL_BEGIN]]:
-; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[ADD1]], %[[LOOP_LATCH]] ]
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT1]], %[[LOOP_LATCH]] ]
 ; CHECK-NEXT:    [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD1]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ [[IV_NEXT1]], %[[LOOP_LATCH]] ]
+; CHECK-NEXT:    [[RED:%.*]] = phi i32 [ [[ADD1]], %[[LOOP_LATCH]] ]
 ; CHECK-NEXT:    br label %[[LOOP_HEADER_PEEL:.*]]
 ; CHECK:       [[LOOP_HEADER_PEEL]]:
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[IV]], 99
diff --git a/llvm/test/Transforms/LoopUnroll/peel-last-iteration-with-variable-trip-count.ll b/llvm/test/Transforms/LoopUnroll/peel-last-iteration-with-variable-trip-count.ll
index 75f3674732f35..92c1e1e47d8bd 100644
--- a/llvm/test/Transforms/LoopUnroll/peel-last-iteration-with-variable-trip-count.ll
+++ b/llvm/test/Transforms/LoopUnroll/peel-last-iteration-with-variable-trip-count.ll
@@ -9,18 +9,38 @@ define i32 @peel_last_with_trip_count_check_lcssa_phi(i32 %n) {
 ; CHECK-SAME: i32 [[N:%.*]]) {
 ; CHECK-NEXT:  [[ENTRY:.*]]:
 ; CHECK-NEXT:    [[SUB:%.*]] = add i32 [[N]], -1
+; CHECK-NEXT:    [[TMP0:%.*]] = icmp ne i32 [[SUB]], 0
+; CHECK-NEXT:    br i1 [[TMP0]], label %[[ENTRY_SPLIT:.*]], label %[[EXIT_PEEL_BEGIN:.*]]
+; CHECK:       [[ENTRY_SPLIT]]:
 ; CHECK-NEXT:    br label %[[LOOP:.*]]
 ; CHECK:       [[LOOP]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    [[IV1:%.*]] = phi i32 [ 0, %[[ENTRY_SPLIT]] ], [ [[IV_NEXT1:%.*]], %[[LOOP]] ]
+; CHECK-NEXT:    call void @foo(i32 2)
+; CHECK-NEXT:    [[IV_NEXT1]] = add nuw i32 [[IV1]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = sub i32 [[N]], 1
+; CHECK-NEXT:    [[EC1:%.*]] = icmp ne i32 [[IV_NEXT1]], [[TMP1]]
+; CHECK-NEXT:    br i1 [[EC1]], label %[[LOOP]], label %[[EXIT_PEEL_BEGIN_LOOPEXIT:.*]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK:       [[EXIT_PEEL_BEGIN_LOOPEXIT]]:
+; CHECK-NEXT:    [[SEL_LCSSA_PH:%.*]] = phi i32 [ 2, %[[LOOP]] ]
+; CHECK-NEXT:    [[DOTPH:%.*]] = phi i32 [ [[IV_NEXT1]], %[[LOOP]] ]
+; CHECK-NEXT:    br label %[[EXIT_PEEL_BEGIN]]
+; CHECK:       [[EXIT_PEEL_BEGIN]]:
+; CHECK-NEXT:    [[SEL_LCSSA:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[SEL_LCSSA_PH]], %[[EXIT_PEEL_BEGIN_LOOPEXIT]] ]
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[DOTPH]], %[[EXIT_PEEL_BEGIN_LOOPEXIT]] ]
+; CHECK-NEXT:    br label %[[LOOP_PEEL:.*]]
+; CHECK:       [[LOOP_PEEL]]:
 ; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[IV]], [[SUB]]
 ; CHECK-NEXT:    [[SEL:%.*]] = select i1 [[C]], i32 1, i32 2
 ; CHECK-NEXT:    call void @foo(i32 [[SEL]])
-; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add i32 [[IV]], 1
 ; CHECK-NEXT:    [[EC:%.*]] = icmp ne i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EC]], label %[[LOOP]], label %[[EXIT:.*]]
+; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT_PEEL_NEXT:.*]], label %[[EXIT_PEEL_NEXT]]
+; CHECK:       [[EXIT_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[LOOP_PEEL_NEXT:.*]]
+; CHECK:       [[LOOP_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[EXIT:.*]]
 ; CHECK:       [[EXIT]]:
-; CHECK-NEXT:    [[SEL_LCSSA:%.*]] = phi i32 [ [[SEL]], %[[LOOP]] ]
-; CHECK-NEXT:    ret i32 [[SEL_LCSSA]]
+; CHECK-NEXT:    ret i32 [[SEL]]
 ;
 entry:
   %sub = add i32 %n, -1
@@ -160,21 +180,44 @@ define void @peel_last_with_trip_count_check_nested_loop(i32 %n) {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
 ; CHECK-NEXT:    [[SUB:%.*]] = add i32 [[N]], -1
 ; CHECK-NEXT:    br label %[[OUTER_HEADER:.*]]
-; CHECK:       [[OUTER_HEADER_LOOPEXIT:.*]]:
+; CHECK:       [[OUTER_HEADER_LOOPEXIT_PEEL_BEGIN_LOOPEXIT:.*]]:
+; CHECK-NEXT:    [[DOTPH:%.*]] = phi i32 [ [[IV_NEXT1:%.*]], %[[INNER_LATCH:.*]] ]
+; CHECK-NEXT:    br label %[[OUTER_HEADER_LOOPEXIT_PEEL_BEGIN:.*]]
+; CHECK:       [[OUTER_HEADER_LOOPEXIT_PEEL_BEGIN]]:
+; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[OUTER_HEADER]] ], [ [[DOTPH]], %[[OUTER_HEADER_LOOPEXIT_PEEL_BEGIN_LOOPEXIT]] ]
+; CHECK-NEXT:    br label %[[INNER_HEADER_PEEL:.*]]
+; CHECK:       [[INNER_HEADER_PEEL]]:
+; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[IV]], [[SUB]]
+; CHECK-NEXT:    br i1 [[C]], label %[[INNER_LATCH_PEEL:.*]], label %[[THEN_PEEL:.*]]
+; CHECK:       [[THEN_PEEL]]:
+; CHECK-NEXT:    call void @foo(i32 1)
+; CHECK-NEXT:    br label %[[INNER_LATCH_PEEL]]
+; CHECK:       [[INNER_LATCH_PEEL]]:
+; CHECK-NEXT:    [[IV_NEXT:%.*]] = add i32 [[IV]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[OUTER_HEADER_LOOPEXIT_PEEL_NEXT:.*]], label %[[OUTER_HEADER_LOOPEXIT_PEEL_NEXT]]
+; CHECK:       [[OUTER_HEADER_LOOPEXIT_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[INNER_HEADER_PEEL_NEXT:.*]]
+; CHECK:       [[INNER_HEADER_PEEL_NEXT]]:
+; CHECK-NEXT:    br label %[[OUTER_HEADER_LOOPEXIT:.*]]
+; CHECK:       [[OUTER_HEADER_LOOPEXIT]]:
 ; CHECK-NEXT:    br label %[[OUTER_HEADER]]
 ; CHECK:       [[OUTER_HEADER]]:
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp ne i32 [[SUB]], 0
+; CHECK-NEXT:    br i1 [[TMP1]], label %[[OUTER_HEADER_SPLIT:.*]], label %[[OUTER_HEADER_LOOPEXIT_PEEL_BEGIN]]
+; CHECK:       [[OUTER_HEADER_SPLIT]]:
 ; CHECK-NEXT:    br label %[[INNER_HEADER:.*]]
 ; CHECK:       [[INNER_HEADER]]:
-; CHECK-NEXT:    [[IV:%.*]] = phi i32 [ 0, %[[OUTER_HEADER]] ], [ [[IV_NEXT:%.*]], %[[INNER_LATCH:.*]] ]
-; CHECK-NEXT:    [[C:%.*]] = icmp eq i32 [[IV]], [[SUB]]
-; CHECK-NEXT:    br i1 [[C]], label %[[INNER_LATCH]], label %[[THEN:.*]]
+; CHECK-NEXT:    [[IV1:%.*]] = phi i32 [ 0, %[[OUTER_HEADER_SPLIT]] ], [ [[IV_NEXT1]], %[[INNER_LATCH]] ]
+; CHECK-NEXT:    br i1 false, label %[[INNER_LATCH]], label %[[THEN:.*]]
 ; CHECK:       [[THEN]]:
 ; CHECK-NEXT:    call void @foo(i32 1)
 ; CHECK-NEXT:    br label %[[INNER_LATCH]]
 ; CHECK:       [[INNER_LATCH]]:
-; CHECK-NEXT:    [[IV_NEXT]] = add i32 [[IV]], 1
-; CHECK-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]]
-; CHECK-NEXT:    br i1 [[EXITCOND_NOT]], label %[[OUTER_HEADER_LOOPEXIT]], label %[[INNER_HEADER]]
+; CHECK-NEXT:    [[IV_NEXT1]] = add nuw i32 [[IV1]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = sub i32 [[N]], 1
+; CHECK-NEXT:    [[EXITCOND_NOT1:%.*]] = icmp eq i32 [[IV_NEXT1]], [[TMP2]]
+; CHECK-NEXT:    br i1 [[EXITCOND_NOT1]], label %[[OUTER_HEADER_LOOPEXIT_PEEL_BEGIN_LOOPEXIT]], label %[[INNER_HEADER]], !llvm.loop [[LOOP2:![0-9]+]]
 ;
 entry:
   %sub = add i32 %n, -1
@@ -197,3 +240,8 @@ inner.latch:
   %exitcond.not = icmp eq i32 %iv.next, %n
   br i1 %exitcond.not, label %outer.header, label %inner.header
 }
+;.
+; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
+; CHECK: [[META1]] = !{!"llvm.loop.peeled.count", i32 1}
+; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
+;.
diff --git a/llvm/test/Transforms/LoopUnroll/unroll-and-peel-last-iteration.ll b/llvm/test/Transforms/LoopUnroll/unroll-and-peel-last-iteration.ll
index f57fb2d9b7057..e36b834969c87 100644
--- a/llvm/test/Transforms/LoopUnroll/unroll-and-peel-last-iteration.ll
+++ b/llvm/test/Transforms/LoopUnroll/unroll-and-peel-last-iteration.ll
@@ -19,8 +19,8 @@ define i32 @peel_last_iter_of_outer_lcssa_phi_with_constant_after_unrolling_inne
 ; CHECK-NEXT:    [[EC:%.*]] = icmp eq i16 [[IV_NEXT]], 999
 ; CHECK-NEXT:    br i1 [[EC]], label %[[EXIT_PEEL_BEGIN:.*]], label %[[OUTER_HEADER]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       [[EXIT_PEEL_BEGIN]]:
-; CHECK-NEXT:    [[IV_NEXT_LCSSA:%.*]] = phi i16 [ [[IV_NEXT]], %[[INNER_LATCH]] ]
 ; CHECK-NEXT:    [[DOTLCSSA_LCSSA:%.*]] = phi i32 [ 1, %[[INNER_LATCH]] ]
+; CHECK-NEXT:    [[IV_NEXT_LCSSA:%.*]] = phi i16 [ [[IV_NEXT]], %[[INNER_LATCH]] ]
 ; CHECK-NEXT:    br label %[[OUTER_HEADER_PEEL:.*]]
 ; CHECK:       [[OUTER_HEADER_PEEL]]:
 ; CHECK-NEXT:    [[TMP0:%.*]] = icmp eq i16 [[IV_NEXT_LCSSA]], 999
 | 
    
| // TODO: Add checks during codegen. | ||
| if (isa<SCEVCouldNotCompute>(BTC) || | ||
| !SE.isKnownPredicate(CmpInst::ICMP_UGT, BTC, SE.getZero(BTC->getType()))) | ||
| if (isa<SCEVCouldNotCompute>(BTC)) | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should we be checking isHighCostExpansion() or something like that somewhere?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, updated thanks!
| 
               | 
          ||
| // If the original loop may only execute a single iteration we need to | ||
| // insert a trip count check and skip the peeled loop if necessary. | ||
| if (!SE->isKnownPredicate(CmpInst::ICMP_UGT, BTC, | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
UGT -> NE
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated to use isKnownNonZero, which is a bit more compact
| // If we branch from PreHeader to InsertTop, we are guaranteed to execute | ||
| // the peeled iteration, so the exit values from the original loop are | ||
| // dead. Use poison for them. | ||
| for (auto &PN : InsertTop->phis()) | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Wait, don't these need to the be the starting values for the loops IVs? If we branch directly to the peeled iteration, don't we need to have the phis start with the original start values?
Note the comment is a tad confusing. The loop exit values are poison, but the incoming value isn't coming from the loop body, it's coming from the preheader. Unless I'm getting confused from the variable naming here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The phis here are the LCSSA phis exiting the original loop. But the uses of the exit phis get replaced with new exit values later on. I update the code there to just remove the unused phi nodes, then there's nothing to do here.
| // resolve this statically: | ||
| if (PeelLast) { | ||
| // For the last iteration, we use the value from the latch of the original | ||
| // loop directly. | 
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Update the comment to include the new case please.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated, thanks!
Additional test to #140792 with different SCEV expansion costs.
…ivial BTC. Additional test to llvm/llvm-project#140792 with different SCEV expansion costs.
| 
          
 ✅ With the latest revision this PR passed the C/C++ code formatter.  | 
    
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Should this use getExitCount() instead of getBackedgeTakenCount()? Peeling shouldn't care if there's an early exit from some other block.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yep, let me take a look at that separately, thanks!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Put up a separate patch: #141247
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM
Remove the restriction that the loop must be known to execute at least 2 iterations when peeling the last iteration. If we cannot prove at least 2 iterations are executed, a check and branch to skip the peeled loop is inserted.
…g last. (#140792) Remove the restriction that the loop must be known to execute at least 2 iterations when peeling the last iteration. If we cannot prove at least 2 iterations are executed, a check and branch to skip the peeled loop is inserted. PR: llvm/llvm-project#140792
| 
           I think this breaks bot for clang bootstrapping: https://green.lab.llvm.org/job/llvm.org/job/clang-stage2-Rthinlto/787/ Can you take a look to see if this indeed caused by your change?  | 
    
| 
           I also bisected a failure down to this revision, noted when building ffmpeg. (It is also possible to reproduce with the ffmpeg build integration in llvm-test-suite, but only if building in  It can be reproduced with this reduced case: int fastaudio_decode_avctx_0, fastaudio_decode_i;
void fastaudio_decode() {
  float m_2;
  for (int index5;; fastaudio_decode_i++) {
    for (int j = 0; j < 21; j++)
      if (j == 20)
        index5 = index5 % 2;
    m_2 = fastaudio_decode_avctx_0;
  }
}$ clang -target x86_64-linux-gnu -c -O2 repro.c -g
clang: ../lib/IR/Instruction.cpp:161: void llvm::Instruction::insertBefore(llvm:
:BasicBlock&, llvm::iplist_impl<llvm::simple_ilist<llvm::Instruction, llvm::ilis
t_iterator_bits<true>, llvm::ilist_parent<llvm::BasicBlock> >, llvm::SymbolTable
ListTraits<llvm::Instruction, llvm::ilist_iterator_bits<true>, llvm::ilist_paren
t<llvm::BasicBlock> > >::iterator): Assertion `!isa<PHINode>(this) && "Inserting
 PHI after debug-records!"' failed. | 
    
| 
           Thanks, this should be fixed by setting the insert point to before the first non-phi: ac9a466  | 
    
| 
           We've seen a failure while compiling  Jinx!  | 
    
| 
           This commit causes a regression. I can't build CMake-3.30.5 (the version recommended by spack, the   | 
    
| 
           Could you share a reproducer?  | 
    
| 
           reverted for now in 5804545, if you could still share a reproducer that would be great  | 
    
Adds extra test coverage for #140792.
…inductions Adds extra test coverage for llvm/llvm-project#140792.
…last. (#140792)" This reverts commit 5804545. The recommitted version contains an extra check to not peel if the latch exit is controlled by a pointer induction. Original message: Remove the restriction that the loop must be known to execute at least 2 iterations when peeling the last iteration. If we cannot prove at least 2 iterations are executed, a check and branch to skip the peeled loop is inserted. PR: #140792
…en peeling last. (#140792)" This reverts commit 5804545. The recommitted version contains an extra check to not peel if the latch exit is controlled by a pointer induction. Original message: Remove the restriction that the loop must be known to execute at least 2 iterations when peeling the last iteration. If we cannot prove at least 2 iterations are executed, a check and branch to skip the peeled loop is inserted. PR: llvm/llvm-project#140792
| 
           Recommitted in f98bdd9, with skipping loops where the latch exit is controlled by a pointer IV which caused the crash  | 
    
          
 Hi @fhahn our CI is all green this morning (and this includes building the specific version of CMake) and the overnight testing have picked up your commit, so we can assume your fix has worked. Thank you.  | 
    
| 
           Sent #142943 to revert the commit (+1 dependent commit) in case there's no reasonably fast and safe fix.  | 
    
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The root cause is in CodeGenPrepare: #142949
Remove the restriction that the loop must be known to execute at least 2 iterations when peeling the last iteration. If we cannot prove at least 2 iterations are executed, a check and branch to skip the peeled loop is inserted.