diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 953748f07af02..a8bb2dd0e621d 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -82,11 +82,10 @@ static cl::opt<unsigned>
     UnrollThreshold("unroll-threshold", cl::Hidden,
                     cl::desc("The cost threshold for loop unrolling"));
 
-static cl::opt<unsigned>
-    UnrollOptSizeThreshold(
-        "unroll-optsize-threshold", cl::init(0), cl::Hidden,
-        cl::desc("The cost threshold for loop unrolling when optimizing for "
-                 "size"));
+static cl::opt<unsigned> UnrollOptSizeThreshold(
+    "unroll-optsize-threshold", cl::init(0), cl::Hidden,
+    cl::desc("The cost threshold for loop unrolling when optimizing for "
+             "size"));
 
 static cl::opt<unsigned> UnrollPartialThreshold(
     "unroll-partial-threshold", cl::Hidden,
@@ -151,9 +150,9 @@ static cl::opt<unsigned> FlatLoopTripCountThreshold(
                  "threshold, the loop is considered as flat and will be less "
                  "aggressively unrolled."));
 
-static cl::opt<bool> UnrollUnrollRemainder(
-    "unroll-remainder", cl::Hidden,
-    cl::desc("Allow the loop remainder to be unrolled."));
+static cl::opt<bool>
+    UnrollUnrollRemainder("unroll-remainder", cl::Hidden,
+                          cl::desc("Allow the loop remainder to be unrolled."));
 
 // This option isn't ever intended to be enabled, it serves to allow
 // experiments to check the assumptions about when this kind of revisit is
@@ -336,8 +335,259 @@ struct PragmaInfo {
   const bool PragmaEnableUnroll;
 };
 
+/// Helper type to estimate per-iteration cost savings coming from fully
+/// unrolling a loop.
+///
+/// The analysis maintains a set of foldable instructions inside the loop
+/// (i.e., instructions whose result will be statically known after loop
+/// unrolling) that we assume will be entirely removable if the loop is fully
+/// unrolled. These instructions' cost can be deducted from the unrolled cost
+/// when comparing against a threshold.
+struct FullUnrollCostSavings {
+  FullUnrollCostSavings(const Loop *L) : L(L) {}
+
+  /// Returns whether the instruction is foldable.
+  inline bool isFoldable(const Instruction *I) const {
+    return Foldable.contains(I);
+  }
+
+  /// If the value is an instruction, returns whether that instruction is
+  /// foldable; returns false otherwise.
+  bool isFoldable(const Value *V) const {
+    if (const Instruction *I = dyn_cast<Instruction>(V))
+      return isFoldable(I);
+    return false;
+  }
+
+  /// Adds an instruction to the foldable set and re-evaluates instructions in
+  /// the loop to determine whether they are now foldable.
+  void addFoldable(const Instruction *I) {
+    if (!Foldable.insert(I).second)
+      return;
+
+    // Every time we assume foldability of an additional instruction, we
+    // potentially need to revisit instructions that were previously seen as
+    // unfoldable.
+    Evaluated.clear();
+
+    addUsersToExploreSet(I);
+    while (!ToEvaluate.empty())
+      evalInstruction(ToEvaluate.pop_back_val());
+  }
+
+  /// Returns the cost savings achieved by all foldable instructions,
+  /// according to \p TTI.
+  InstructionCost computeSavings(const TargetTransformInfo &TTI) const {
+    TargetTransformInfo::TargetCostKind CostKind =
+        L->getHeader()->getParent()->hasMinSize()
+            ? TargetTransformInfo::TCK_CodeSize
+            : TargetTransformInfo::TCK_SizeAndLatency;
+
+    InstructionCost CostSavings;
+    for (const Value *Val : Foldable)
+      CostSavings += TTI.getInstructionCost(cast<Instruction>(Val), CostKind);
+    return CostSavings;
+  }
+
+private:
+  /// The set of instructions inside the loop which we consider foldable.
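+  /// For example, once the loop's IV is in the set, instructions such as
+  /// "%inc = add i32 %iv, 1" or "%cmp = icmp ult i32 %inc, 8", which depend
+  /// only on the IV and on constants, belong in the set as well.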
+  SmallPtrSet<const Value *, 8> Foldable;
+  /// Caches the set of instructions we have already evaluated when adding a
+  /// new instruction to the foldable set.
+  SmallPtrSet<const Instruction *, 8> Evaluated;
+  /// Stack of instructions to evaluate when adding a new instruction to the
+  /// foldable set.
+  SmallVector<const Instruction *, 8> ToEvaluate;
+  /// The loop under consideration.
+  const Loop *L;
+
+  /// Adds all of the value's users to the stack of instructions to evaluate,
+  /// if they have not been evaluated already.
+  void addUsersToExploreSet(const Value *Val) {
+    for (const User *U : Val->users()) {
+      if (const Instruction *I = dyn_cast<Instruction>(U))
+        if (!Evaluated.contains(I))
+          ToEvaluate.push_back(I);
+    }
+  }
+
+  /// Evaluates an instruction to determine whether it is foldable, and
+  /// returns true if that is the case. This may recurse on operands that are
+  /// the result of as-yet-unevaluated instructions inside the loop.
+  bool evalInstruction(const Instruction *I) {
+    Evaluated.insert(I);
+    if (isFoldable(I))
+      return true;
+    if (I->mayHaveSideEffects() || I->isTerminator() || isa<LoadInst>(I))
+      return false;
+    bool IsFoldable;
+    if (isa<SelectInst>(I)) {
+      // Special-case select instructions: if the condition operand will be
+      // constant, the result equals one of the other two operands, so the
+      // instruction is foldable.
+      IsFoldable = valWillBeConstant(I->getOperand(0));
+    } else {
+      IsFoldable = true;
+      // All instruction operands must end up as constants for the instruction
+      // to be foldable.
+      for (const Value *Val : I->operand_values()) {
+        if (!valWillBeConstant(Val)) {
+          IsFoldable = false;
+          break;
+        }
+      }
+    }
+    if (IsFoldable) {
+      Foldable.insert(I);
+      addUsersToExploreSet(I);
+    }
+    return IsFoldable;
+  }
+
+  /// Returns whether the value will be a compile-time constant on every
+  /// iteration once the loop is fully unrolled.
+  bool valWillBeConstant(const Value *Val) {
+    if (isa<Constant>(Val) || isFoldable(Val))
+      return true;
+    const Instruction *ValInstr = dyn_cast<Instruction>(Val);
+    if (!ValInstr || Evaluated.contains(ValInstr) || !L->contains(ValInstr))
+      return false;
+    return evalInstruction(ValInstr);
+  }
+};
+
 } // end anonymous namespace
 
+/// Runs a fast analysis on the loop to determine whether it is worth fully
+/// unrolling it. As opposed to analyzeLoopUnrollCost, this does not attempt
+/// to simulate execution of every loop iteration; instead, it tries to
+/// identify the set of instructions that will be optimized away if the loop
+/// is fully unrolled. Returns the estimated instruction cost savings per loop
+/// iteration if the loop were to be fully unrolled according to the trip
+/// count in UP.Count.
+static InstructionCost analyzeFullUnrollCostSavings(
+    const Loop *L, ScalarEvolution &SE, const TargetTransformInfo &TTI,
+    const TargetTransformInfo::UnrollingPreferences &UP) {
+  // The cost savings analysis is entirely based on unrolling making some
+  // instructions foldable; if we cannot identify the loop's IV, there is
+  // nothing we can do.
+  PHINode *IV = L->getInductionVariable(SE);
+  if (!IV)
+    return {};
+  FullUnrollCostSavings Savings(L);
+
+  // If we were to unroll the loop, everything that depends only on the IV and
+  // constants will get simplified away.
+  Savings.addFoldable(IV);
+
+  // Look for subloops whose trip count would go from runtime-dependent to
+  // runtime-independent if we were to unroll the loop. These subloops are
+  // likely to become fully unrollable in turn and to yield further cost
+  // savings.
+  unsigned NumUnrollableSubloops = 0;
+  for (const Loop *SubLoop : L->getSubLoops()) {
+    // We must be able to determine the subloop's IV, initial/final IV values,
+    // and step.
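+    // For example, for a subloop of the form "for (j = i; j < 8; ++j)" where
+    // i is the outer loop's IV, the bounds are i, 8, and 1, and the trip
+    // count 8 - i becomes a compile-time constant once i is known.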
+    PHINode *SubIV = SubLoop->getInductionVariable(SE);
+    if (!SubIV)
+      continue;
+    std::optional<Loop::LoopBounds> Bounds = SubLoop->getBounds(SE);
+    if (!Bounds)
+      continue;
+    Value *StepVal = Bounds->getStepValue();
+    if (!StepVal)
+      continue;
+
+    bool SubBoundsDependsOnIV = false;
+    auto IsValKnown = [&](const Value *Val) -> bool {
+      if (isa<Constant>(Val))
+        return true;
+      if (Savings.isFoldable(Val)) {
+        SubBoundsDependsOnIV = true;
+        return true;
+      }
+      return false;
+    };
+
+    // Determine whether the derivation of the subloop's bounds depends
+    // exclusively on constants and the outer loop's IV.
+    if (IsValKnown(&Bounds->getInitialIVValue()) &&
+        IsValKnown(&Bounds->getFinalIVValue()) && IsValKnown(StepVal) &&
+        SubBoundsDependsOnIV) {
+      // Optimistically assume that we will be able to unroll the subloop in
+      // the future, which means that its IV will also be known on all inner
+      // loop iterations, leading to more instructions being optimized away.
+      // Properly estimating the cost savings per outer loop iteration would
+      // require us to estimate the average subloop trip count, which is too
+      // complicated for this analysis. When determining cost savings, we
+      // very conservatively assume that the inner loop only executes once
+      // per outer loop iteration. This also reduces our cost savings
+      // estimation error in the case where the subloop does not end up being
+      // unrolled.
+      Savings.addFoldable(SubIV);
+      ++NumUnrollableSubloops;
+
+      LLVM_DEBUG({
+        dbgs() << "  Trip count of subloop ";
+        SubLoop->getHeader()->printAsOperand(dbgs(), false);
+        dbgs() << " will become runtime-independent by fully unrolling loop ";
+        L->getHeader()->printAsOperand(dbgs(), false);
+        dbgs() << '\n';
+      });
+    }
+  }
+
+  // Look for conditional branches whose condition would be statically
+  // determined at each iteration of the loop if it were unrolled. In some
+  // cases, this means we will be able to remove the branch entirely.
+  for (const BasicBlock *BB : L->getBlocks()) {
+    const Instruction *TermInstr = BB->getTerminator();
+    if (const BranchInst *Br = dyn_cast<BranchInst>(TermInstr)) {
+      if (Br->isConditional() && Savings.isFoldable(Br->getCondition())) {
+        // The branch condition will be statically determined at each
+        // iteration of the loop.
+        BasicBlock *TrueSucc = Br->getSuccessor(0),
+                   *FalseSucc = Br->getSuccessor(1);
+
+        // Checks whether a branch successor has at most two predecessors,
+        // which must be either the branch's own block or the other branch
+        // successor.
+        auto IsIfThen = [&](auto Predecessors, BasicBlock *OtherSucc) -> bool {
+          unsigned NumPreds = 0;
+          for (const BasicBlock *Pred : Predecessors) {
+            if (Pred != BB && Pred != OtherSucc)
+              return false;
+            if (++NumPreds > 2)
+              return false;
+          }
+          return true;
+        };
+
+        if ((TrueSucc->getSinglePredecessor() ||
+             IsIfThen(predecessors(TrueSucc), FalseSucc)) &&
+            (FalseSucc->getSinglePredecessor() ||
+             IsIfThen(predecessors(FalseSucc), TrueSucc))) {
+          // The CFG corresponds to a simple if/then(/else) construct whose
+          // condition we will know, so we will be able to remove the branch
+          // and one of the two blocks at each iteration of the outer loop.
+          // Only the branch represents a cost saving, since one successor
+          // block will still be executed.
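+          // The two recognized shapes, where BB ends with the foldable
+          // branch:
+          //
+          //   if/then/else:      if/then:
+          //        BB               BB
+          //       /  \             /  \
+          //    Then    Else     Then   |
+          //       \    /           \   |
+          //        Join             Join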
+          Savings.addFoldable(Br);
+          LLVM_DEBUG({
+            dbgs() << "  Conditional branch will be removed by fully "
+                      "unrolling loop ";
+            L->getHeader()->printAsOperand(dbgs(), false);
+            dbgs() << '\n';
+          });
+        }
+      }
+    }
+  }
+
+  // Compute cost savings from instructions that will likely be optimized away
+  // by unrolling the loop.
+  InstructionCost CostSavings = Savings.computeSavings(TTI);
+  // Finally, for each subloop that we think will become unrollable, account
+  // for its backedge's branch being removed.
+  CostSavings += NumUnrollableSubloops;
+  return CostSavings;
+}
+
 /// Figure out if the loop is worth full unrolling.
 ///
 /// Complete loop unrolling can make some loads constant, and we need to know
@@ -832,34 +1082,54 @@ shouldPragmaUnroll(Loop *L, const PragmaInfo &PInfo,
   return std::nullopt;
 }
 
-static std::optional<unsigned> shouldFullUnroll(
-    Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT,
-    ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
-    const unsigned FullUnrollTripCount, const UnrollCostEstimator UCE,
-    const TargetTransformInfo::UnrollingPreferences &UP) {
-  assert(FullUnrollTripCount && "should be non-zero!");
+static bool
+shouldFullUnroll(Loop *L, const TargetTransformInfo &TTI, DominatorTree &DT,
+                 ScalarEvolution &SE,
+                 const SmallPtrSetImpl<const Value *> &EphValues,
+                 const UnrollCostEstimator UCE,
+                 const TargetTransformInfo::UnrollingPreferences &UP) {
+  assert(UP.Count && "should be non-zero!");
 
-  if (FullUnrollTripCount > UP.FullUnrollMaxCount)
-    return std::nullopt;
+  if (UP.Count > UP.FullUnrollMaxCount)
+    return false;
 
   // When computing the unrolled size, note that BEInsns are not replicated
   // like the rest of the loop body.
   if (UCE.getUnrolledLoopSize(UP) < UP.Threshold)
-    return FullUnrollTripCount;
+    return true;
 
   // The loop isn't that small, but we still can fully unroll it if that
-  // helps to remove a significant number of instructions.
-  // To check that, run additional analysis on the loop.
+  // helps to remove a significant number of instructions. To check that, run
+  // additional analyses on the loop: first a full iteration-by-iteration
+  // analysis and, if that fails, a simpler structural analysis that estimates
+  // per-iteration cost savings in the unrolled loop.
   if (std::optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
-          L, FullUnrollTripCount, DT, SE, EphValues, TTI,
+          L, UP.Count, DT, SE, EphValues, TTI,
           UP.Threshold * UP.MaxPercentThresholdBoost / 100,
           UP.MaxIterationsCountToAnalyze)) {
     unsigned Boost =
-      getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
+        getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
     if (Cost->UnrolledCost < UP.Threshold * Boost / 100)
-      return FullUnrollTripCount;
+      return true;
+  } else {
+    InstructionCost Savings = analyzeFullUnrollCostSavings(L, SE, TTI, UP);
+    if (!Savings.isValid() || !*Savings.getValue())
+      return false;
+    // Savings for one loop iteration are those estimated by the analysis plus
+    // the loop backedge's branch.
+    uint64_t ItSavings = *Savings.getValue() + 1;
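+    // For example, with estimated per-iteration savings of 3 (ItSavings == 4
+    // with the backedge branch), a rolled loop size of 10, and UP.Count == 2,
+    // the estimated full-unroll cost below is (10 - 4) * 2 + 1 = 13.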
+    // Compute the estimated cost of one loop iteration in the unrolled form.
+    uint64_t ItUnrollCost = UCE.getRolledLoopSize();
+    if (ItSavings < ItUnrollCost)
+      ItUnrollCost -= ItSavings;
+    else
+      ItUnrollCost = 1;
+    uint64_t FullUnrollCost = ItUnrollCost * UP.Count + 1;
+    assert(FullUnrollCost && "loop has no cost");
+    if (FullUnrollCost < UP.Threshold)
+      return true;
   }
 
-  return std::nullopt;
+  return false;
 }
 
 static std::optional<unsigned>
@@ -872,7 +1142,7 @@ shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,
 
   if (!UP.Partial) {
     LLVM_DEBUG(dbgs() << "  will not try to unroll partially because "
-               << "-unroll-allow-partial not given\n");
+                      << "-unroll-allow-partial not given\n");
     return 0;
   }
   unsigned count = UP.Count;
@@ -882,7 +1152,7 @@ shouldPartialUnroll(const unsigned LoopSize, const unsigned TripCount,
   // Reduce unroll count to be modulo of TripCount for partial unrolling.
   if (UCE.getUnrolledLoopSize(UP, count) > UP.PartialThreshold)
     count = (std::max(UP.PartialThreshold, UP.BEInsns + 1) - UP.BEInsns) /
-        (LoopSize - UP.BEInsns);
+            (LoopSize - UP.BEInsns);
   if (count > UP.MaxCount)
     count = UP.MaxCount;
   while (count != 0 && TripCount % count != 0)
@@ -979,9 +1249,7 @@ bool llvm::computeUnrollCount(
   UP.Count = 0;
   if (TripCount) {
     UP.Count = TripCount;
-    if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
-                                             TripCount, UCE, UP)) {
-      UP.Count = *UnrollFactor;
+    if (shouldFullUnroll(L, TTI, DT, SE, EphValues, UCE, UP)) {
       UseUpperBound = false;
       return ExplicitUnroll;
     }
@@ -1002,9 +1270,7 @@
   if (!TripCount && MaxTripCount && (UP.UpperBound || MaxOrZero) &&
       MaxTripCount <= UP.MaxUpperBound) {
     UP.Count = MaxTripCount;
-    if (auto UnrollFactor = shouldFullUnroll(L, TTI, DT, SE, EphValues,
-                                             MaxTripCount, UCE, UP)) {
-      UP.Count = *UnrollFactor;
+    if (shouldFullUnroll(L, TTI, DT, SE, EphValues, UCE, UP)) {
      UseUpperBound = true;
       return ExplicitUnroll;
     }
diff --git a/llvm/test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll b/llvm/test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll
index 556a4032b58e4..8f4f71abf37a9 100644
--- a/llvm/test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll
+++ b/llvm/test/Transforms/LoopUnroll/complete_unroll_profitability_with_assume.ll
@@ -22,55 +22,73 @@ define i32 @foo(ptr %a) {
 ; ANALYZE-FULL:       for.body:
 ; ANALYZE-FULL-NEXT:    br i1 true, label [[DO_STORE:%.*]], label [[FOR_NEXT:%.*]]
 ; ANALYZE-FULL:       do_store:
-; ANALYZE-FULL-NEXT:    store i32 0, ptr [[A:%.*]], align 4
+; ANALYZE-FULL-NEXT:    [[DATA:%.*]] = load i32, ptr [[A:%.*]], align 4
+; ANALYZE-FULL-NEXT:    [[DATA_MUL:%.*]] = mul i32 [[DATA]], 2
+; ANALYZE-FULL-NEXT:    store i32 [[DATA_MUL]], ptr [[A]], align 4
 ; ANALYZE-FULL-NEXT:    br label [[FOR_NEXT]]
 ; ANALYZE-FULL:       for.next:
 ; ANALYZE-FULL-NEXT:    br i1 true, label [[DO_STORE_1:%.*]], label [[FOR_NEXT_1:%.*]]
 ; ANALYZE-FULL:       do_store.1:
 ; ANALYZE-FULL-NEXT:    [[GEP_1:%.*]] = getelementptr i32, ptr [[A]], i32 1
-; ANALYZE-FULL-NEXT:    store i32 1, ptr [[GEP_1]], align 4
+; ANALYZE-FULL-NEXT:    [[DATA_1:%.*]] = load i32, ptr [[GEP_1]], align 4
+; ANALYZE-FULL-NEXT:    [[DATA_MUL_1:%.*]] = mul i32 [[DATA_1]], 2
+; ANALYZE-FULL-NEXT:    store i32 [[DATA_MUL_1]], ptr [[GEP_1]], align 4
 ; ANALYZE-FULL-NEXT:    br label [[FOR_NEXT_1]]
 ; ANALYZE-FULL:       for.next.1:
 ; ANALYZE-FULL-NEXT:    br i1 true, label [[DO_STORE_2:%.*]], label [[FOR_NEXT_2:%.*]]
 ; ANALYZE-FULL:       do_store.2:
 ; ANALYZE-FULL-NEXT:    [[GEP_2:%.*]] = getelementptr i32, ptr [[A]], i32 2
-; ANALYZE-FULL-NEXT:    store i32 2, ptr [[GEP_2]], align 4
+; ANALYZE-FULL-NEXT:    [[DATA_2:%.*]] = load i32, ptr [[GEP_2]], align 4
+; ANALYZE-FULL-NEXT:    [[DATA_MUL_2:%.*]] = mul i32 [[DATA_2]], 2
+; ANALYZE-FULL-NEXT:    store i32 [[DATA_MUL_2]], ptr [[GEP_2]], align 4
 ; ANALYZE-FULL-NEXT:    br label [[FOR_NEXT_2]]
 ; ANALYZE-FULL:       for.next.2:
 ; ANALYZE-FULL-NEXT:    br i1 true, label [[DO_STORE_3:%.*]], label [[FOR_NEXT_3:%.*]]
 ; ANALYZE-FULL:       do_store.3:
 ; ANALYZE-FULL-NEXT:    [[GEP_3:%.*]] = getelementptr i32, ptr [[A]], i32 3
-; ANALYZE-FULL-NEXT:    store i32 3, ptr [[GEP_3]], align 4
+; ANALYZE-FULL-NEXT:    [[DATA_3:%.*]] = load i32, ptr [[GEP_3]], align 4
+; ANALYZE-FULL-NEXT:    [[DATA_MUL_3:%.*]] = mul i32 [[DATA_3]], 2
+; ANALYZE-FULL-NEXT:    store i32 [[DATA_MUL_3]], ptr [[GEP_3]], align 4
 ; ANALYZE-FULL-NEXT:    br label [[FOR_NEXT_3]]
 ; ANALYZE-FULL:       for.next.3:
 ; ANALYZE-FULL-NEXT:    br i1 false, label [[DO_STORE_4:%.*]], label [[FOR_NEXT_4:%.*]]
 ; ANALYZE-FULL:       do_store.4:
 ; ANALYZE-FULL-NEXT:    [[GEP_4:%.*]] = getelementptr i32, ptr [[A]], i32 4
-; ANALYZE-FULL-NEXT:    store i32 4, ptr [[GEP_4]], align 4
+; ANALYZE-FULL-NEXT:    [[DATA_4:%.*]] = load i32, ptr [[GEP_4]], align 4
+; ANALYZE-FULL-NEXT:    [[DATA_MUL_4:%.*]] = mul i32 [[DATA_4]], 2
+; ANALYZE-FULL-NEXT:    store i32 [[DATA_MUL_4]], ptr [[GEP_4]], align 4
 ; ANALYZE-FULL-NEXT:    br label [[FOR_NEXT_4]]
 ; ANALYZE-FULL:       for.next.4:
 ; ANALYZE-FULL-NEXT:    br i1 false, label [[DO_STORE_5:%.*]], label [[FOR_NEXT_5:%.*]]
 ; ANALYZE-FULL:       do_store.5:
 ; ANALYZE-FULL-NEXT:    [[GEP_5:%.*]] = getelementptr i32, ptr [[A]], i32 5
-; ANALYZE-FULL-NEXT:    store i32 5, ptr [[GEP_5]], align 4
+; ANALYZE-FULL-NEXT:    [[DATA_5:%.*]] = load i32, ptr [[GEP_5]], align 4
+; ANALYZE-FULL-NEXT:    [[DATA_MUL_5:%.*]] = mul i32 [[DATA_5]], 2
+; ANALYZE-FULL-NEXT:    store i32 [[DATA_MUL_5]], ptr [[GEP_5]], align 4
 ; ANALYZE-FULL-NEXT:    br label [[FOR_NEXT_5]]
 ; ANALYZE-FULL:       for.next.5:
 ; ANALYZE-FULL-NEXT:    br i1 false, label [[DO_STORE_6:%.*]], label [[FOR_NEXT_6:%.*]]
 ; ANALYZE-FULL:       do_store.6:
 ; ANALYZE-FULL-NEXT:    [[GEP_6:%.*]] = getelementptr i32, ptr [[A]], i32 6
-; ANALYZE-FULL-NEXT:    store i32 6, ptr [[GEP_6]], align 4
+; ANALYZE-FULL-NEXT:    [[DATA_6:%.*]] = load i32, ptr [[GEP_6]], align 4
+; ANALYZE-FULL-NEXT:    [[DATA_MUL_6:%.*]] = mul i32 [[DATA_6]], 2
+; ANALYZE-FULL-NEXT:    store i32 [[DATA_MUL_6]], ptr [[GEP_6]], align 4
 ; ANALYZE-FULL-NEXT:    br label [[FOR_NEXT_6]]
 ; ANALYZE-FULL:       for.next.6:
 ; ANALYZE-FULL-NEXT:    br i1 false, label [[DO_STORE_7:%.*]], label [[FOR_NEXT_7:%.*]]
 ; ANALYZE-FULL:       do_store.7:
 ; ANALYZE-FULL-NEXT:    [[GEP_7:%.*]] = getelementptr i32, ptr [[A]], i32 7
-; ANALYZE-FULL-NEXT:    store i32 7, ptr [[GEP_7]], align 4
+; ANALYZE-FULL-NEXT:    [[DATA_7:%.*]] = load i32, ptr [[GEP_7]], align 4
+; ANALYZE-FULL-NEXT:    [[DATA_MUL_7:%.*]] = mul i32 [[DATA_7]], 2
+; ANALYZE-FULL-NEXT:    store i32 [[DATA_MUL_7]], ptr [[GEP_7]], align 4
 ; ANALYZE-FULL-NEXT:    br label [[FOR_NEXT_7]]
 ; ANALYZE-FULL:       for.next.7:
 ; ANALYZE-FULL-NEXT:    br i1 false, label [[DO_STORE_8:%.*]], label [[FOR_NEXT_8:%.*]]
 ; ANALYZE-FULL:       do_store.8:
 ; ANALYZE-FULL-NEXT:    [[GEP_8:%.*]] = getelementptr i32, ptr [[A]], i32 8
-; ANALYZE-FULL-NEXT:    store i32 8, ptr [[GEP_8]], align 4
+; ANALYZE-FULL-NEXT:    [[DATA_8:%.*]] = load i32, ptr [[GEP_8]], align 4
+; ANALYZE-FULL-NEXT:    [[DATA_MUL_8:%.*]] = mul i32 [[DATA_8]], 2
+; ANALYZE-FULL-NEXT:    store i32 [[DATA_MUL_8]], ptr [[GEP_8]], align 4
 ; ANALYZE-FULL-NEXT:    br label [[FOR_NEXT_8]]
 ; ANALYZE-FULL:       for.next.8:
 ; ANALYZE-FULL-NEXT:    ret i32 9
@@ -87,7 +105,10 @@ define i32 @foo(ptr %a) {
 ; DONT-ANALYZE-FULL-NEXT:    br i1 [[CMP2]], label [[DO_STORE:%.*]], label [[FOR_NEXT]]
 ; DONT-ANALYZE-FULL:       do_store:
 ; DONT-ANALYZE-FULL-NEXT:    [[GEP:%.*]] = getelementptr i32, ptr [[A:%.*]], i32 [[INDVAR]]
-; DONT-ANALYZE-FULL-NEXT:    store i32 [[INDVAR]], ptr [[GEP]], align 4
+; DONT-ANALYZE-FULL-NEXT:    [[DATA:%.*]] = load i32, ptr [[GEP]], align 4
+; DONT-ANALYZE-FULL-NEXT:    [[DATA_MUL:%.*]] = mul i32 [[DATA]], 2
+; DONT-ANALYZE-FULL-NEXT:    [[DATA_ADD:%.*]] = add i32 [[DATA_MUL]], 1
+; DONT-ANALYZE-FULL-NEXT:    store i32 [[DATA_MUL]], ptr [[GEP]], align 4
 ; DONT-ANALYZE-FULL-NEXT:    br label [[FOR_NEXT]]
 ; DONT-ANALYZE-FULL:       for.next:
 ; DONT-ANALYZE-FULL-NEXT:    [[EXITCOND:%.*]] = icmp ne i32 [[INDVAR_NEXT]], 9
@@ -108,7 +129,10 @@ for.body:
 
 do_store:
   %gep = getelementptr i32, ptr %a, i32 %indvar
-  store i32 %indvar, ptr %gep
+  %data = load i32, ptr %gep
+  %data_mul = mul i32 %data, 2
+  %data_add = add i32 %data_mul, 1
+  store i32 %data_mul, ptr %gep
   br label %for.next
 
 for.next:
diff --git a/llvm/test/Transforms/LoopUnroll/full-unroll-cost-savings.ll b/llvm/test/Transforms/LoopUnroll/full-unroll-cost-savings.ll
new file mode 100644
index 0000000000000..1658af6dd55b9
--- /dev/null
+++ b/llvm/test/Transforms/LoopUnroll/full-unroll-cost-savings.ll
@@ -0,0 +1,354 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -passes=loop-unroll -unroll-threshold=25 < %s | FileCheck %s
+
+; All functions are simple variations of the same double nested loop with an
+; if/then/else-like CFG structure in the outer loop. The unrolling threshold
+; is set manually so that it is just slightly higher than the estimated
+; unrolled cost of the outer loop in the baseline, even after the unroll cost
+; savings analysis.
+
+; Baseline. The inner loop's bounds and the if/then/else's condition depend on
+; function arguments. No unrolling happens.
+
+define void @no_fullunroll(ptr noundef %mem, i32 noundef %inner.ub, i32 noundef %ifcond) {
+; CHECK-LABEL: @no_fullunroll(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer.header:
+; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OUTER_IV_NEXT:%.*]], [[OUTER_LATCH_EXITING:%.*]] ]
+; CHECK-NEXT:    [[OUTER_IV_EXT:%.*]] = zext nneg i32 [[OUTER_IV]] to i64
+; CHECK-NEXT:    br label [[INNER_HEADER_LATCH_EXITING:%.*]]
+; CHECK:       inner.header_latch_exiting:
+; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i32 [ [[OUTER_IV]], [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
+; CHECK-NEXT:    [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
+; CHECK-NEXT:    [[IDX_PART:%.*]] = mul nuw nsw i64 [[OUTER_IV_EXT]], 16
+; CHECK-NEXT:    [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
+; CHECK-NEXT:    [[IDX:%.*]] = add nuw nsw i64 [[IDX_PART]], [[INNER_IV_EXT]]
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[IDX]]
+; CHECK-NEXT:    store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT:    [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], [[INNER_UB:%.*]]
+; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_IF:%.*]]
+; CHECK:       outer.if:
+; CHECK-NEXT:    [[IF_ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[OUTER_IV_EXT]]
+; CHECK-NEXT:    [[MOD2:%.*]] = and i32 [[IFCOND:%.*]], 1
+; CHECK-NEXT:    [[IF_COND:%.*]] = icmp ult i32 [[MOD2]], 0
+; CHECK-NEXT:    br i1 [[IF_COND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i32 1, ptr [[IF_ADDR]], align 4
+; CHECK-NEXT:    br label [[OUTER_LATCH_EXITING]]
+; CHECK:       if.else:
+; CHECK-NEXT:    store i32 2, ptr [[IF_ADDR]], align 4
+; CHECK-NEXT:    br label [[OUTER_LATCH_EXITING]]
+; CHECK:       outer.latch_exiting:
+; CHECK-NEXT:    [[OUTER_IV_NEXT]] = add nuw nsw i32 [[OUTER_IV]], 1
+; CHECK-NEXT:    [[OUTER_COND:%.*]] = icmp ult i32 [[OUTER_IV_NEXT]], 2
+; CHECK-NEXT:    br i1 [[OUTER_COND]], label [[OUTER_HEADER]], label [[END:%.*]]
+; CHECK:       end:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %outer.header
+
+outer.header:                                     ; preds = %entry, %outer.latch_exiting
+  %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+  %outer.iv.ext = zext nneg i32 %outer.iv to i64
+  br label %inner.header_latch_exiting
+
+inner.header_latch_exiting:                       ; preds = %outer.header, %inner.header_latch_exiting
+  %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+  %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+  %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+  %inner.iv.ext = zext nneg i32 %inner.iv to i64
+  %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+  %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+  store i32 0, ptr %addr
+  %inner.cond = icmp ult i32 %inner.iv_next, %inner.ub
+  br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.if
+
+outer.if:                                         ; preds = %inner.header_latch_exiting
+  %if.addr = getelementptr inbounds i8, ptr %mem, i64 %outer.iv.ext
+  %mod2 = and i32 %ifcond, 1
+  %if.cond = icmp ult i32 %mod2, 0
+  br i1 %if.cond, label %if.then, label %if.else
+
+if.then:                                          ; preds = %outer.if
+  store i32 1, ptr %if.addr
+  br label %outer.latch_exiting
+
+if.else:                                          ; preds = %outer.if
+  store i32 2, ptr %if.addr
+  br label %outer.latch_exiting
+
+outer.latch_exiting:                              ; preds = %if.then, %if.else
+  %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+  %outer.cond = icmp ult i32 %outer.iv_next, 2
+  br i1 %outer.cond, label %outer.header, label %end
+
+end:                                              ; preds = %outer.latch_exiting
+  ret void
+}
+
+; The inner loop's bounds depend on constants and the outer IV, yielding extra
+; cost savings. These are enough to fully unroll the outer loop.
+
+define void @save_subloop(ptr noundef %mem, i32 noundef %ifcond) {
+; CHECK-LABEL: @save_subloop(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer.header:
+; CHECK-NEXT:    br label [[INNER_HEADER_LATCH_EXITING:%.*]]
+; CHECK:       inner.header_latch_exiting:
+; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
+; CHECK-NEXT:    [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
+; CHECK-NEXT:    [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[INNER_IV_EXT]]
+; CHECK-NEXT:    store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT:    [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], 2
+; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING_1:%.*]]
+; CHECK:       outer.if:
+; CHECK-NEXT:    br i1 false, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i32 1, ptr [[MEM]], align 4
+; CHECK-NEXT:    br label [[OUTER_LATCH_EXITING:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    store i32 2, ptr [[MEM]], align 4
+; CHECK-NEXT:    br label [[OUTER_LATCH_EXITING]]
+; CHECK:       outer.latch_exiting:
+; CHECK-NEXT:    br label [[INNER_HEADER_LATCH_EXITING_1:%.*]]
+; CHECK:       inner.header_latch_exiting.1:
+; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i32 [ 1, [[OUTER_LATCH_EXITING]] ], [ [[OUTER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ]
+; CHECK-NEXT:    [[OUTER_IV_NEXT_1]] = add nuw nsw i32 [[OUTER_IV]], 1
+; CHECK-NEXT:    [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[OUTER_IV]] to i64
+; CHECK-NEXT:    [[IDX_1:%.*]] = add nuw nsw i64 16, [[INNER_IV_EXT_1]]
+; CHECK-NEXT:    [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]]
+; CHECK-NEXT:    store i32 0, ptr [[ADDR_1]], align 4
+; CHECK-NEXT:    [[INNER_COND_1:%.*]] = icmp ult i32 [[OUTER_IV_NEXT_1]], 2
+; CHECK-NEXT:    br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_IF_1:%.*]]
+; CHECK:       outer.if.1:
+; CHECK-NEXT:    [[IF_ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 1
+; CHECK-NEXT:    br i1 false, label [[IF_THEN_1:%.*]], label [[IF_ELSE_1:%.*]]
+; CHECK:       if.else.1:
+; CHECK-NEXT:    store i32 2, ptr [[IF_ADDR_1]], align 4
+; CHECK-NEXT:    br label [[OUTER_LATCH_EXITING_2:%.*]]
+; CHECK:       if.then.1:
+; CHECK-NEXT:    store i32 1, ptr [[IF_ADDR_1]], align 4
+; CHECK-NEXT:    br label [[OUTER_LATCH_EXITING_2]]
+; CHECK:       outer.latch_exiting.1:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %outer.header
+
+outer.header:                                     ; preds = %entry, %outer.latch_exiting
+  %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+  %outer.iv.ext = zext nneg i32 %outer.iv to i64
+  br label %inner.header_latch_exiting
+
+inner.header_latch_exiting:                       ; preds = %outer.header, %inner.header_latch_exiting
+  %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+  %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+  %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+  %inner.iv.ext = zext nneg i32 %inner.iv to i64
+  %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+  %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+  store i32 0, ptr %addr
+  %inner.cond = icmp ult i32 %inner.iv_next, 2
+  br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.if
+
+outer.if:                                         ; preds = %inner.header_latch_exiting
+  %if.addr = getelementptr inbounds i8, ptr %mem, i64 %outer.iv.ext
+  %mod2 = and i32 %ifcond, 1
+  %if.cond = icmp ult i32 %mod2, 0
+  br i1 %if.cond, label %if.then, label %if.else
+
+if.then:                                          ; preds = %outer.if
+  store i32 1, ptr %if.addr
+  br label %outer.latch_exiting
+
+if.else:                                          ; preds = %outer.if
+  store i32 2, ptr %if.addr
+  br label %outer.latch_exiting
+
+outer.latch_exiting:                              ; preds = %if.then, %if.else
+  %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+  %outer.cond = icmp ult i32 %outer.iv_next, 2
+  br i1 %outer.cond, label %outer.header, label %end
+
+end:                                              ; preds = %outer.latch_exiting
+  ret void
+}
+
+; The if/then/else's condition depends on constants and the outer IV, yielding
+; extra cost savings. These are enough to fully unroll the outer loop.
+
+define void @save_ifthenelse(ptr noundef %mem, i32 noundef %inner.ub) {
+; CHECK-LABEL: @save_ifthenelse(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer.header:
+; CHECK-NEXT:    br label [[INNER_HEADER_LATCH_EXITING:%.*]]
+; CHECK:       inner.header_latch_exiting:
+; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
+; CHECK-NEXT:    [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
+; CHECK-NEXT:    [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[INNER_IV_EXT]]
+; CHECK-NEXT:    store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT:    [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], [[INNER_UB:%.*]]
+; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_LATCH_EXITING_1:%.*]]
+; CHECK:       outer.if:
+; CHECK-NEXT:    br i1 false, label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i32 1, ptr [[MEM]], align 4
+; CHECK-NEXT:    br label [[OUTER_LATCH_EXITING:%.*]]
+; CHECK:       if.else:
+; CHECK-NEXT:    store i32 2, ptr [[MEM]], align 4
+; CHECK-NEXT:    br label [[OUTER_LATCH_EXITING]]
+; CHECK:       outer.latch_exiting:
+; CHECK-NEXT:    br label [[INNER_HEADER_LATCH_EXITING_1:%.*]]
+; CHECK:       inner.header_latch_exiting.1:
+; CHECK-NEXT:    [[OUTER_IV:%.*]] = phi i32 [ 1, [[OUTER_LATCH_EXITING]] ], [ [[OUTER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ]
+; CHECK-NEXT:    [[OUTER_IV_NEXT_1]] = add nuw nsw i32 [[OUTER_IV]], 1
+; CHECK-NEXT:    [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[OUTER_IV]] to i64
+; CHECK-NEXT:    [[IDX_1:%.*]] = add nuw nsw i64 16, [[INNER_IV_EXT_1]]
+; CHECK-NEXT:    [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]]
+; CHECK-NEXT:    store i32 0, ptr [[ADDR_1]], align 4
+; CHECK-NEXT:    [[INNER_COND_1:%.*]] = icmp ult i32 [[OUTER_IV_NEXT_1]], [[INNER_UB]]
+; CHECK-NEXT:    br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_IF_1:%.*]]
+; CHECK:       outer.if.1:
+; CHECK-NEXT:    [[IF_ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 1
+; CHECK-NEXT:    br i1 false, label [[IF_THEN_1:%.*]], label [[IF_ELSE_1:%.*]]
+; CHECK:       if.else.1:
+; CHECK-NEXT:    store i32 2, ptr [[IF_ADDR_1]], align 4
+; CHECK-NEXT:    br label [[OUTER_LATCH_EXITING_2:%.*]]
+; CHECK:       if.then.1:
+; CHECK-NEXT:    store i32 1, ptr [[IF_ADDR_1]], align 4
+; CHECK-NEXT:    br label [[OUTER_LATCH_EXITING_2]]
+; CHECK:       outer.latch_exiting.1:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %outer.header
+
+outer.header:                                     ; preds = %entry, %outer.latch_exiting
+  %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+  %outer.iv.ext = zext nneg i32 %outer.iv to i64
+  br label %inner.header_latch_exiting
+
+inner.header_latch_exiting:                       ; preds = %outer.header, %inner.header_latch_exiting
+  %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+  %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+  %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+  %inner.iv.ext = zext nneg i32 %inner.iv to i64
+  %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+  %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+  store i32 0, ptr %addr
+  %inner.cond = icmp ult i32 %inner.iv_next, %inner.ub
+  br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.if
+
+outer.if:                                         ; preds = %inner.header_latch_exiting
+  %if.addr = getelementptr inbounds i8, ptr %mem, i64 %outer.iv.ext
+  %mod2 = and i32 %outer.iv, 1
+  %if.cond = icmp ult i32 %mod2, 0
+  br i1 %if.cond, label %if.then, label %if.else
+
+if.then:                                          ; preds = %outer.if
+  store i32 1, ptr %if.addr
+  br label %outer.latch_exiting
+
+if.else:                                          ; preds = %outer.if
+  store i32 2, ptr %if.addr
+  br label %outer.latch_exiting
+
+outer.latch_exiting:                              ; preds = %if.then, %if.else
+  %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+  %outer.cond = icmp ult i32 %outer.iv_next, 2
+  br i1 %outer.cond, label %outer.header, label %end
+
+end:                                              ; preds = %outer.latch_exiting
+  ret void
+}
+
+
+; Tests whether an if/then-like CFG structure is also recognized as a cost
+; saving opportunity. Same double nested loop as before, but the if's else
+; branch is removed and two extra instructions are added to the then branch to
+; maintain the same loop size.
+
+define void @save_ifthen(ptr noundef %mem, i32 noundef %inner.ub) {
+; CHECK-LABEL: @save_ifthen(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    br label [[OUTER_HEADER:%.*]]
+; CHECK:       outer.header:
+; CHECK-NEXT:    br label [[INNER_HEADER_LATCH_EXITING:%.*]]
+; CHECK:       inner.header_latch_exiting:
+; CHECK-NEXT:    [[INNER_IV:%.*]] = phi i32 [ 0, [[OUTER_HEADER]] ], [ [[INNER_IV_NEXT:%.*]], [[INNER_HEADER_LATCH_EXITING]] ]
+; CHECK-NEXT:    [[INNER_IV_NEXT]] = add nuw nsw i32 [[INNER_IV]], 1
+; CHECK-NEXT:    [[INNER_IV_EXT:%.*]] = zext nneg i32 [[INNER_IV]] to i64
+; CHECK-NEXT:    [[ADDR:%.*]] = getelementptr inbounds i8, ptr [[MEM:%.*]], i64 [[INNER_IV_EXT]]
+; CHECK-NEXT:    store i32 0, ptr [[ADDR]], align 4
+; CHECK-NEXT:    [[INNER_COND:%.*]] = icmp ult i32 [[INNER_IV_NEXT]], [[INNER_UB:%.*]]
+; CHECK-NEXT:    br i1 [[INNER_COND]], label [[INNER_HEADER_LATCH_EXITING]], label [[OUTER_IF:%.*]]
+; CHECK:       outer.if:
+; CHECK-NEXT:    br i1 false, label [[IF_THEN:%.*]], label [[OUTER_LATCH_EXITING:%.*]]
+; CHECK:       if.then:
+; CHECK-NEXT:    store i32 0, ptr [[MEM]], align 4
+; CHECK-NEXT:    br label [[OUTER_LATCH_EXITING]]
+; CHECK:       outer.latch_exiting:
+; CHECK-NEXT:    br label [[INNER_HEADER_LATCH_EXITING_1:%.*]]
+; CHECK:       inner.header_latch_exiting.1:
+; CHECK-NEXT:    [[INNER_IV_1:%.*]] = phi i32 [ 1, [[OUTER_LATCH_EXITING]] ], [ [[INNER_IV_NEXT_1:%.*]], [[INNER_HEADER_LATCH_EXITING_1]] ]
+; CHECK-NEXT:    [[INNER_IV_NEXT_1]] = add nuw nsw i32 [[INNER_IV_1]], 1
+; CHECK-NEXT:    [[INNER_IV_EXT_1:%.*]] = zext nneg i32 [[INNER_IV_1]] to i64
+; CHECK-NEXT:    [[IDX_1:%.*]] = add nuw nsw i64 16, [[INNER_IV_EXT_1]]
+; CHECK-NEXT:    [[ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 [[IDX_1]]
+; CHECK-NEXT:    store i32 0, ptr [[ADDR_1]], align 4
+; CHECK-NEXT:    [[INNER_COND_1:%.*]] = icmp ult i32 [[INNER_IV_NEXT_1]], [[INNER_UB]]
+; CHECK-NEXT:    br i1 [[INNER_COND_1]], label [[INNER_HEADER_LATCH_EXITING_1]], label [[OUTER_IF_1:%.*]]
+; CHECK:       outer.if.1:
+; CHECK-NEXT:    [[IF_ADDR_1:%.*]] = getelementptr inbounds i8, ptr [[MEM]], i64 1
+; CHECK-NEXT:    br i1 false, label [[IF_THEN_1:%.*]], label [[OUTER_LATCH_EXITING_1:%.*]]
+; CHECK:       if.then.1:
+; CHECK-NEXT:    store i32 4, ptr [[IF_ADDR_1]], align 4
+; CHECK-NEXT:    br label [[OUTER_LATCH_EXITING_1]]
+; CHECK:       outer.latch_exiting.1:
+; CHECK-NEXT:    ret void
+;
+entry:
+  br label %outer.header
+
+outer.header:                                     ; preds = %entry, %outer.latch_exiting
+  %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+  %outer.iv.ext = zext nneg i32 %outer.iv to i64
+  br label %inner.header_latch_exiting
+
+inner.header_latch_exiting:                       ; preds = %outer.header, %inner.header_latch_exiting
+  %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+  %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+  %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+  %inner.iv.ext = zext nneg i32 %inner.iv to i64
+  %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+  %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+  store i32 0, ptr %addr
+  %inner.cond = icmp ult i32 %inner.iv_next, %inner.ub
+  br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.if
+
+outer.if:                                         ; preds = %inner.header_latch_exiting
+  %if.addr = getelementptr inbounds i8, ptr %mem, i64 %outer.iv.ext
+  %mod2 = and i32 %outer.iv, 1
+  %if.cond = icmp ult i32 %mod2, 0
+  br i1 %if.cond, label %if.then, label %outer.latch_exiting
+
+if.then:                                          ; preds = %outer.if
+  %mod2x2 = mul i32 %mod2, 2
+  %mod2x2x2 = mul i32 %mod2x2, 2
+  store i32 %mod2x2x2, ptr %if.addr
+  br label %outer.latch_exiting
+
+outer.latch_exiting:                              ; preds = %if.then, %outer.if
+  %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+  %outer.cond = icmp ult i32 %outer.iv_next, 2
+  br i1 %outer.cond, label %outer.header, label %end
+
+end:                                              ; preds = %outer.latch_exiting
+  ret void
+}