llvm · jdenny-ornl · Oct 2, 2025 · Mar 19, 2025 · Mar 26, 2025 · Apr 4, 2025
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
@@ -7866,6 +7866,17 @@ The attributes in this metadata is added to all followup loops of the
 loop distribution pass. See
 :ref:`Transformation Metadata <transformation-metadata>` for details.
 
+'``llvm.loop.estimated_trip_count``' Metadata
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This metadata records the loop's estimated trip count.  If it is not present, a
+loop's estimated trip count should be computed from any ``branch_weights``
+metadata attached to the latch block's branch instruction.
+
+Thus, this metadata frees loop transformations to compute latch branch weights
+solely for the purpose of maintaining accurate block frequencies instead of
+requiring the branch weights to always serve both roles.
+
 '``llvm.licm.disable``' Metadata
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 

diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -315,7 +315,8 @@ TransformationMode hasLICMVersioningTransformation(const Loop *L);
 void addStringMetadataToLoop(Loop *TheLoop, const char *MDString,
                              unsigned V = 0);
 
-/// Returns a loop's estimated trip count based on branch weight metadata.
+/// Returns a loop's estimated trip count based on
+/// llvm.loop.estimated_trip_count metadata or, if none, branch weight metadata.
 /// In addition if \p EstimatedLoopInvocationWeight is not null it is
 /// initialized with weight of loop's latch leading to the exit.
 /// Returns a valid positive trip count, saturated at UINT_MAX, or std::nullopt
@@ -324,13 +325,21 @@ std::optional<unsigned>
 getLoopEstimatedTripCount(Loop *L,
                           unsigned *EstimatedLoopInvocationWeight = nullptr);
 
-/// Set a loop's branch weight metadata to reflect that loop has \p
-/// EstimatedTripCount iterations and \p EstimatedLoopInvocationWeight exits
-/// through latch. Returns true if metadata is successfully updated, false
-/// otherwise. Note that loop must have a latch block which controls loop exit
-/// in order to succeed.
-bool setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
-                               unsigned EstimatedLoopInvocationWeight);
+/// Set a loop's llvm.loop.estimated_trip_count metadata and, if \p
+/// EstimatedLoopInvocationWeight, branch weight metadata to reflect that loop
+/// has \p EstimatedTripCount iterations and \p EstimatedLoopInvocationWeight
+/// exit weight through latch. Returns true if metadata is successfully updated,
+/// false otherwise. Note that loop must have a latch block which controls loop
+/// exit in order to succeed.
+///
+/// The use case for not setting branch weight metadata is when the original
+/// branch weight metadata is correct for computing block frequencies but the
+/// trip count has changed due to a loop transformation.  The branch weight
+/// metadata cannot be adjusted to reflect the new trip count, so we store the
+/// new trip count separately.
+bool setLoopEstimatedTripCount(
+    Loop *L, unsigned EstimatedTripCount,
+    std::optional<unsigned> EstimatedLoopInvocationWeight);
 
 /// Check inner loop (L) backedge count is known to be invariant on all
 /// iterations of its outer loop. If the loop has no parent, this is trivially

diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp
@@ -651,84 +651,6 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
   }
 }
 
-struct WeightInfo {
-  // Weights for current iteration.
-  SmallVector<uint32_t> Weights;
-  // Weights to subtract after each iteration.
-  const SmallVector<uint32_t> SubWeights;
-};
-
-/// Update the branch weights of an exiting block of a peeled-off loop
-/// iteration.
-/// Let F is a weight of the edge to continue (fallthrough) into the loop.
-/// Let E is a weight of the edge to an exit.
-/// F/(F+E) is a probability to go to loop and E/(F+E) is a probability to
-/// go to exit.
-/// Then, Estimated ExitCount = F / E.
-/// For I-th (counting from 0) peeled off iteration we set the weights for
-/// the peeled exit as (EC - I, 1). It gives us reasonable distribution,
-/// The probability to go to exit 1/(EC-I) increases. At the same time
-/// the estimated exit count in the remainder loop reduces by I.
-/// To avoid dealing with division rounding we can just multiple both part
-/// of weights to E and use weight as (F - I * E, E).
-static void updateBranchWeights(Instruction *Term, WeightInfo &Info) {
-  setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false);
-  for (auto [Idx, SubWeight] : enumerate(Info.SubWeights))
-    if (SubWeight != 0)
-      // Don't set the probability of taking the edge from latch to loop header
-      // to less than 1:1 ratio (meaning Weight should not be lower than
-      // SubWeight), as this could significantly reduce the loop's hotness,
-      // which would be incorrect in the case of underestimating the trip count.
-      Info.Weights[Idx] =
-          Info.Weights[Idx] > SubWeight
-              ? std::max(Info.Weights[Idx] - SubWeight, SubWeight)
-              : SubWeight;
-}
-
-/// Initialize the weights for all exiting blocks.
-static void initBranchWeights(DenseMap<Instruction *, WeightInfo> &WeightInfos,
-                              Loop *L) {
-  SmallVector<BasicBlock *> ExitingBlocks;
-  L->getExitingBlocks(ExitingBlocks);
-  for (BasicBlock *ExitingBlock : ExitingBlocks) {
-    Instruction *Term = ExitingBlock->getTerminator();
-    SmallVector<uint32_t> Weights;
-    if (!extractBranchWeights(*Term, Weights))
-      continue;
-
-    // See the comment on updateBranchWeights() for an explanation of what we
-    // do here.
-    uint32_t FallThroughWeights = 0;
-    uint32_t ExitWeights = 0;
-    for (auto [Succ, Weight] : zip(successors(Term), Weights)) {
-      if (L->contains(Succ))
-        FallThroughWeights += Weight;
-      else
-        ExitWeights += Weight;
-    }
-
-    // Don't try to update weights for degenerate case.
-    if (FallThroughWeights == 0)
-      continue;
-
-    SmallVector<uint32_t> SubWeights;
-    for (auto [Succ, Weight] : zip(successors(Term), Weights)) {
-      if (!L->contains(Succ)) {
-        // Exit weights stay the same.
-        SubWeights.push_back(0);
-        continue;
-      }
-
-      // Subtract exit weights on each iteration, distributed across all
-      // fallthrough edges.
-      double W = (double)Weight / (double)FallThroughWeights;
-      SubWeights.push_back((uint32_t)(ExitWeights * W));
-    }
-
-    WeightInfos.insert({Term, {std::move(Weights), std::move(SubWeights)}});
-  }
-}
-
 /// Clones the body of the loop L, putting it between \p InsertTop and \p
 /// InsertBot.
 /// \param IterNumber The serial number of the iteration currently being
@@ -1002,11 +924,6 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
   Instruction *LatchTerm =
       cast<Instruction>(cast<BasicBlock>(Latch)->getTerminator());
 
-  // If we have branch weight information, we'll want to update it for the
-  // newly created branches.
-  DenseMap<Instruction *, WeightInfo> Weights;
-  initBranchWeights(Weights, L);
-
   // Identify what noalias metadata is inside the loop: if it is inside the
   // loop, the associated metadata must be cloned for each iteration.
   SmallVector<MDNode *, 6> LoopLocalNoAliasDeclScopes;
@@ -1034,11 +951,6 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
     assert(DT.verify(DominatorTree::VerificationLevel::Fast));
 #endif
 
-    for (auto &[Term, Info] : Weights) {
-      auto *TermCopy = cast<Instruction>(VMap[Term]);
-      updateBranchWeights(TermCopy, Info);
-    }
-
     // Remove Loop metadata from the latch branch instruction
     // because it is not the Loop's latch branch anymore.
     auto *LatchTermCopy = cast<Instruction>(VMap[LatchTerm]);
@@ -1064,15 +976,62 @@ bool llvm::peelLoop(Loop *L, unsigned PeelCount, LoopInfo *LI,
     PHI->setIncomingValueForBlock(NewPreHeader, NewVal);
   }
 
-  for (const auto &[Term, Info] : Weights) {
-    setBranchWeights(*Term, Info.Weights, /*IsExpected=*/false);
-  }
-
   // Update Metadata for count of peeled off iterations.
   unsigned AlreadyPeeled = 0;
   if (auto Peeled = getOptionalIntLoopAttribute(L, PeeledCountMetaData))
     AlreadyPeeled = *Peeled;
-  addStringMetadataToLoop(L, PeeledCountMetaData, AlreadyPeeled + PeelCount);
+  unsigned TotalPeeled = AlreadyPeeled + PeelCount;
+  addStringMetadataToLoop(L, PeeledCountMetaData, TotalPeeled);
+
+  // Update metadata for the estimated trip count.  The original branch weight
+  // metadata is already correct for both the remaining loop and the peeled loop
+  // iterations, so don't adjust it.
+  //
+  // For example, consider what happens when peeling 2 iterations from a loop
+  // with an estimated trip count of 10 and inserting them before the remaining
+  // loop.  Each of the peeled iterations and each iteration in the remaining
+  // loop still has the same probability of exiting the *entire original* loop
+  // as it did when in the original loop, and thus it should still have the same
+  // branch weights.  The peeled iterations' non-zero probabilities of exiting
+  // already appropriately reduce the probability of reaching the remaining
+  // iterations just as they did in the original loop.  Trying to also adjust
+  // the remaining loop's branch weights to reflect its new trip count of 8 will
+  // erroneously further reduce its block frequencies.  However, in case an
+  // analysis later needs to determine the trip count of the remaining loop
+  // while examining it in isolation without considering the probability of
+  // actually reaching it, we store the new trip count as separate metadata.
+  //
+  // FIXME: getLoopEstimatedTripCount and setLoopEstimatedTripCount skip loops
+  // that don't match the restrictions of getExpectedExitLoopLatchBranch in
+  // LoopUtils.cpp.  For example,
+  // llvm/tests/Transforms/LoopUnroll/peel-branch-weights.ll (introduced by
+  // b43a4d0850d5) has multiple exits.  Should we try to extend them to handle
+  // such cases?  For now, we just don't try to record
+  // llvm.loop.estimated_trip_count for such cases, so the original branch
+  // weights will have to do.
+  if (auto EstimatedTripCount = getLoopEstimatedTripCount(L)) {
+    // FIXME: The previous updateBranchWeights implementation had this
+    // comment:
+    //
+    //   Don't set the probability of taking the edge from latch to loop header
+    //   to less than 1:1 ratio (meaning Weight should not be lower than
+    //   SubWeight), as this could significantly reduce the loop's hotness,
+    //   which would be incorrect in the case of underestimating the trip count.
+    //
+    // See e8d5db206c2f commit log for further discussion.  That seems to
+    // suggest that we should avoid ever setting a trip count of < 2 here
+    // (equal chance of continuing and exiting means the loop will likely
+    // continue once and then exit once).  Or is keeping the original branch
+    // weights already a sufficient improvement for whatever analysis cares
+    // about this case?
+    unsigned EstimatedTripCountNew = *EstimatedTripCount;
+    if (EstimatedTripCountNew < TotalPeeled) // FIXME: TotalPeeled + 2?
+      EstimatedTripCountNew = 0;             // FIXME: = 2?
+    else
+      EstimatedTripCountNew -= TotalPeeled;
+    setLoopEstimatedTripCount(L, EstimatedTripCountNew,
+                              /*EstimatedLoopInvocationWeight=*/std::nullopt);
+  }
 
   if (Loop *ParentLoop = L->getParentLoop())
     L = ParentLoop;

diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -53,6 +53,8 @@ using namespace llvm::PatternMatch;
 
 static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced";
 static const char *LLVMLoopDisableLICM = "llvm.licm.disable";
+static const char *LLVMLoopEstimatedTripCount =
+    "llvm.loop.estimated_trip_count";
 
 bool llvm::formDedicatedExitBlocks(Loop *L, DominatorTree *DT, LoopInfo *LI,
                                    MemorySSAUpdater *MSSAU,
@@ -863,27 +865,39 @@ llvm::getLoopEstimatedTripCount(Loop *L,
             getEstimatedTripCount(LatchBranch, L, ExitWeight)) {
       if (EstimatedLoopInvocationWeight)
         *EstimatedLoopInvocationWeight = ExitWeight;
+      // FIXME: Where else are branch weights directly used for estimating loop
+      // trip counts?  They should also be updated to use
+      // LLVMLoopEstimatedTripCount when present... or to just call this
+      // function.
+      if (auto EstimatedTripCount =
+              getOptionalIntLoopAttribute(L, LLVMLoopEstimatedTripCount))
+        return EstimatedTripCount;
       return *EstTripCount;
     }
   }
   return std::nullopt;
 }
 
-bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
-                                     unsigned EstimatedloopInvocationWeight) {
+bool llvm::setLoopEstimatedTripCount(
+    Loop *L, unsigned EstimatedTripCount,
+    std::optional<unsigned> EstimatedloopInvocationWeight) {
   // At the moment, we currently support changing the estimate trip count of
   // the latch branch only.  We could extend this API to manipulate estimated
   // trip counts for any exit.
   BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
   if (!LatchBranch)
     return false;
 
+  addStringMetadataToLoop(L, LLVMLoopEstimatedTripCount, EstimatedTripCount);
+  if (!EstimatedloopInvocationWeight)
+    return true;
+
   // Calculate taken and exit weights.
   unsigned LatchExitWeight = 0;
   unsigned BackedgeTakenWeight = 0;
 
   if (EstimatedTripCount > 0) {
-    LatchExitWeight = EstimatedloopInvocationWeight;
+    LatchExitWeight = *EstimatedloopInvocationWeight;
     BackedgeTakenWeight = (EstimatedTripCount - 1) * LatchExitWeight;
   }
 

diff --git a/llvm/test/Transforms/LoopUnroll/peel-branch-weights-freq.ll b/llvm/test/Transforms/LoopUnroll/peel-branch-weights-freq.ll
@@ -0,0 +1,75 @@
+; Test branch weight metadata, estimated trip count metadata, and block
+; frequencies after loop peeling.
+
+; RUN: opt < %s -S -passes='print<block-freq>' 2>&1 | \
+; RUN:   FileCheck -check-prefix=CHECK %s
+
+; The -implicit-check-not options make sure that no additional labels or calls
+; to @f show up.
+; RUN: opt < %s -S -passes='loop-unroll,print<block-freq>' \
+; RUN:     -unroll-force-peel-count=2 2>&1 | \
+; RUN:   FileCheck %s -check-prefix=CHECK-UR \
+; RUN:       -implicit-check-not='{{^[^ ;]*:}}' \
+; RUN:       -implicit-check-not='call void @f'
+
+; CHECK: block-frequency-info: test
+; CHECK: do.body: float = 10.0,
+
+; The sum should still be ~10.
+;
+; CHECK-UR: block-frequency-info: test
+; CHECK-UR: - [[DO_BODY_PEEL:.*]]: float = 1.0,
+; CHECK-UR: - [[DO_BODY_PEEL2:.*]]: float = 0.9,
+; CHECK-UR: - [[DO_BODY:.*]]: float = 8.1,
+
+declare void @f(i32)
+
+define void @test(i32 %n) {
+; CHECK-UR-LABEL: define void @test(
+;       CHECK-UR: [[ENTRY:.*]]:
+;       CHECK-UR:   br label %[[DO_BODY_PEEL_BEGIN:.*]]
+;       CHECK-UR: [[DO_BODY_PEEL_BEGIN]]:
+;       CHECK-UR:   br label %[[DO_BODY_PEEL:.*]]
+;       CHECK-UR: [[DO_BODY_PEEL]]:
+;       CHECK-UR:   call void @f
+;       CHECK-UR:   br i1 %{{.*}}, label %[[DO_END:.*]], label %[[DO_BODY_PEEL_NEXT:.*]], !prof ![[#PROF:]]
+;       CHECK-UR: [[DO_BODY_PEEL_NEXT]]:
+;       CHECK-UR:   br label %[[DO_BODY_PEEL2:.*]]
+;       CHECK-UR: [[DO_BODY_PEEL2]]:
+;       CHECK-UR:   call void @f
+;       CHECK-UR:   br i1 %{{.*}}, label %[[DO_END]], label %[[DO_BODY_PEEL_NEXT1:.*]], !prof ![[#PROF]]
+;       CHECK-UR: [[DO_BODY_PEEL_NEXT1]]:
+;       CHECK-UR:   br label %[[DO_BODY_PEEL_NEXT5:.*]]
+;       CHECK-UR: [[DO_BODY_PEEL_NEXT5]]:
+;       CHECK-UR:   br label %[[ENTRY_PEEL_NEWPH:.*]]
+;       CHECK-UR: [[ENTRY_PEEL_NEWPH]]:
+;       CHECK-UR:   br label %[[DO_BODY]]
+;       CHECK-UR: [[DO_BODY]]:
+;       CHECK-UR:   call void @f
+;       CHECK-UR:   br i1 %{{.*}}, label %[[DO_END_LOOPEXIT:.*]], label %[[DO_BODY]], !prof ![[#PROF]], !llvm.loop ![[#LOOP_UR_LATCH:]]
+;       CHECK-UR: [[DO_END_LOOPEXIT]]:
+;       CHECK-UR:   br label %[[DO_END]]
+;       CHECK-UR: [[DO_END]]:
+;       CHECK-UR:   ret void
+
+entry:
+  br label %do.body
+
+do.body:
+  %i = phi i32 [ 0, %entry ], [ %inc, %do.body ]
+  %inc = add i32 %i, 1
+  call void @f(i32 %i)
+  %c = icmp sge i32 %inc, %n
+  br i1 %c, label %do.end, label %do.body, !prof !0
+
+do.end:
+  ret void
+}
+
+!0 = !{!"branch_weights", i32 1, i32 9}
+
+; CHECK-UR: ![[#PROF]] = !{!"branch_weights", i32 1, i32 9}
+; CHECK-UR: ![[#LOOP_UR_LATCH]] = distinct !{![[#LOOP_UR_LATCH]], ![[#LOOP_UR_PC:]], ![[#LOOP_UR_TC:]], ![[#DISABLE:]]}
+; CHECK-UR: ![[#LOOP_UR_PC]] = !{!"llvm.loop.peeled.count", i32 2}
+; CHECK-UR: ![[#LOOP_UR_TC]] = !{!"llvm.loop.estimated_trip_count", i32 8}
+; CHECK-UR: ![[#DISABLE]] = !{!"llvm.loop.unroll.disable"}