Merged
2 changes: 2 additions & 0 deletions llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -638,6 +638,8 @@ class TargetTransformInfo {
/// Fall back to the generic logic to determine whether multi-exit unrolling
/// is profitable if set to false.
bool RuntimeUnrollMultiExit;
/// Allow unrolling to add parallel reduction phis.
bool AddAdditionalAccumulators;
};

/// Get target-customized preferences for the generic loop unrolling
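For illustration, a minimal sketch of how a target hook could opt into the new preference. The class name `MyTTIImpl` is a placeholder and the hook signature is abbreviated; only the `UnrollingPreferences` fields set below come from this change.

```c++
// Sketch only: a target opting into parallel reduction accumulators when
// unrolling. "MyTTIImpl" is a placeholder class; the UP fields used here are
// the ones touched by this change.
void MyTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                        TTI::UnrollingPreferences &UP,
                                        OptimizationRemarkEmitter *ORE) const {
  // Allow partial unrolling by up to 4 and let the unroller introduce one
  // accumulator phi per unrolled copy of the reduction.
  UP.Partial = true;
  UP.MaxCount = 4;
  UP.AddAdditionalAccumulators = true;
}
```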
1 change: 1 addition & 0 deletions llvm/include/llvm/Transforms/Utils/UnrollLoop.h
@@ -79,6 +79,7 @@ struct UnrollLoopOptions {
const Instruction *Heart = nullptr;
unsigned SCEVExpansionBudget;
bool RuntimeUnrollMultiExit = false;
bool AddAdditionalAccumulators = false;
};

LLVM_ABI LoopUnrollResult UnrollLoop(Loop *L, UnrollLoopOptions ULO,
24 changes: 24 additions & 0 deletions llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -25,6 +25,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/TargetParser/AArch64TargetParser.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
#include <optional>
@@ -4969,6 +4970,23 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
if (!L->getExitBlock())
return;

// Check if the loop contains any reductions that could be parallelized when
// unrolling. If so, enable partial unrolling, if the trip count is known to be
// a multiple of 2.
bool HasParellelizableReductions =
L->getNumBlocks() == 1 &&
any_of(L->getHeader()->phis(),
[&SE, L](PHINode &Phi) {
return canParallelizeReductionWhenUnrolling(Phi, L, &SE);
}) &&
isLoopSizeWithinBudget(L, TTI, 12, nullptr);
Collaborator:
Can we somehow get the unroller itself to compute whether it will actually generate a parallelized reduction, and pass that down to getUnrollingPreferences, instead of trying to recompute it in target-specific code?

Contributor Author:
I had a look, but couldn't really find a good way.

We could add parallelizable reductions to the unrolling preferences, and detect them before handing off to TTI. But then we would always need to perform the reduction detection, which would come with some compile-time cost.

Perhaps there is a different alternative?

Collaborator:
I don't really see any alternative that allows doing the analysis lazily, no.
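To make the trade-off in this thread concrete, here is a rough, hypothetical sketch of the eager alternative: detect the reductions once while gathering preferences and hand them to the target. The struct and function names are invented for illustration; only `canParallelizeReductionWhenUnrolling` is taken from this patch.

```c++
// Hypothetical sketch, not proposed code: eager detection would run for every
// loop the unroller visits, which is the compile-time cost noted above.
struct EagerUnrollPrefsSketch {
  SmallVector<PHINode *, 4> ParallelizableReductionPhis; // invented field
};

static void detectReductionsEagerly(Loop *L, ScalarEvolution &SE,
                                    EagerUnrollPrefsSketch &Prefs) {
  if (L->getNumBlocks() != 1)
    return;
  for (PHINode &Phi : L->getHeader()->phis())
    if (canParallelizeReductionWhenUnrolling(Phi, L, &SE))
      Prefs.ParallelizableReductionPhis.push_back(&Phi);
}
```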

if (HasParellelizableReductions &&
SE.getSmallConstantTripMultiple(L, L->getExitingBlock()) % 2 == 0) {
UP.Partial = true;
UP.MaxCount = 4;
UP.AddAdditionalAccumulators = true;
}

const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
(SE.getSmallConstantMaxTripCount(L) > 0 &&
@@ -4984,6 +5002,12 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
// Limit to loops with trip counts that are cheap to expand.
UP.SCEVExpansionBudget = 1;

if (HasParellelizableReductions) {
UP.Runtime = true;
UP.DefaultUnrollRuntimeCount = 4;
UP.AddAdditionalAccumulators = true;
}

// Try to unroll small loops, of few-blocks with low budget, if they have
// load/store dependencies, to expose more parallel memory access streams,
// or if they do little work inside a block (i.e. load -> X -> store pattern).
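As a standalone illustration of what the extra accumulators buy (not part of the patch), unrolling a sum reduction by 4 with parallel accumulators has roughly the following shape; it mirrors the updated CHECK lines in the test further down.

```c++
// Plain C++ analogue of the transformed IR: four independent partial sums
// replace the single serial accumulator, so the adds form separate dependence
// chains that a wide out-of-order core can execute in parallel.
// Assumes n is a multiple of 4, matching the trip-multiple check above.
int sum_by_4(const int *a, int n) {
  int s0 = 0, s1 = 0, s2 = 0, s3 = 0;
  for (int i = 0; i < n; i += 4) {
    s0 += a[i];
    s1 += a[i + 1];
    s2 += a[i + 2];
    s3 += a[i + 3];
  }
  // Final reduction of the partial sums, like the BIN_RDX adds in the exit
  // block of the unrolled test output.
  return ((s0 + s1) + s2) + s3;
}
```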
2 changes: 2 additions & 0 deletions llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -220,6 +220,7 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
UP.SCEVExpansionBudget = SCEVCheapExpansionBudget;
UP.RuntimeUnrollMultiExit = false;
UP.AddAdditionalAccumulators = false;

// Override with any target specific settings
TTI.getUnrollingPreferences(L, SE, UP, &ORE);
@@ -1354,6 +1355,7 @@ tryToUnrollLoop(Loop *L, DominatorTree &DT, LoopInfo *LI, ScalarEvolution &SE,
ULO.Heart = getLoopConvergenceHeart(L);
ULO.SCEVExpansionBudget = UP.SCEVExpansionBudget;
ULO.RuntimeUnrollMultiExit = UP.RuntimeUnrollMultiExit;
ULO.AddAdditionalAccumulators = UP.AddAdditionalAccumulators;
LoopUnrollResult UnrollResult = UnrollLoop(
L, ULO, LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop, AA);
if (UnrollResult == LoopUnrollResult::Unmodified)
6 changes: 4 additions & 2 deletions llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -671,8 +671,10 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
// to not exit.
DenseMap<PHINode *, RecurrenceDescriptor> Reductions;
bool CanAddAdditionalAccumulators =
UnrollAddParallelReductions && !CompletelyUnroll &&
L->getNumBlocks() == 1 &&
(UnrollAddParallelReductions.getNumOccurrences() > 0
? UnrollAddParallelReductions
: ULO.AddAdditionalAccumulators) &&
!CompletelyUnroll && L->getNumBlocks() == 1 &&
(ULO.Runtime ||
(ExitInfos.contains(Header) && ((ExitInfos[Header].TripCount != 0 &&
ExitInfos[Header].BreakoutTrip == 0))));
140 changes: 117 additions & 23 deletions llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll
@@ -783,16 +783,34 @@ define i32 @test_add_reduction_unroll_partial(ptr %a, i64 noundef %n) {
; APPLE-NEXT: [[ENTRY:.*]]:
; APPLE-NEXT: br label %[[LOOP:.*]]
; APPLE: [[LOOP]]:
; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
; APPLE-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2
; APPLE-NEXT: [[RDX_NEXT]] = add nuw nsw i32 [[RDX]], [[TMP0]]
; APPLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; APPLE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; APPLE-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
; APPLE-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[TMP0]]
; APPLE-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
; APPLE-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]]
; APPLE-NEXT: [[TMP1:%.*]] = load i32, ptr [[GEP_A_1]], align 2
; APPLE-NEXT: [[RDX_NEXT_1]] = add i32 [[RDX_1]], [[TMP1]]
; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
; APPLE-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]]
; APPLE-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A_2]], align 2
; APPLE-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_2]], [[TMP2]]
; APPLE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
; APPLE-NEXT: [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]]
; APPLE-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_3]], align 2
; APPLE-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_3]], [[TMP3]]
; APPLE-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
; APPLE-NEXT: [[EC_3:%.*]] = icmp eq i64 [[IV_NEXT_3]], 1024
; APPLE-NEXT: br i1 [[EC_3]], label %[[EXIT:.*]], label %[[LOOP]]
; APPLE: [[EXIT]]:
; APPLE-NEXT: [[BIN_RDX2:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ]
; APPLE-NEXT: [[RES:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
; APPLE-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_1]], [[RDX_NEXT]]
; APPLE-NEXT: [[BIN_RDX1:%.*]] = add i32 [[RDX_NEXT_2]], [[BIN_RDX]]
; APPLE-NEXT: [[BIN_RDX2:%.*]] = add i32 [[RDX_NEXT_3]], [[BIN_RDX1]]
; APPLE-NEXT: ret i32 [[BIN_RDX2]]
;
; OTHER-LABEL: define i32 @test_add_reduction_unroll_partial(
@@ -923,21 +941,42 @@ define i32 @test_add_and_mul_reduction_unroll_partial(ptr %a, i64 noundef %n) {
; APPLE-NEXT: [[ENTRY:.*]]:
; APPLE-NEXT: br label %[[LOOP:.*]]
; APPLE: [[LOOP]]:
; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_2_NEXT:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[BIN_RDX3:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[RDX_21:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RES_2:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_2_NEXT_3:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[GEP_A:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV]]
; APPLE-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_A]], align 2
; APPLE-NEXT: [[RDX_NEXT]] = add nuw nsw i32 [[RDX]], [[TMP0]]
; APPLE-NEXT: [[RDX_2_NEXT]] = mul i32 [[RDX_2]], [[TMP0]]
; APPLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
; APPLE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 1024
; APPLE-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
; APPLE-NEXT: [[RES_2]] = add i32 [[RDX]], [[TMP0]]
; APPLE-NEXT: [[RDX_2_NEXT:%.*]] = mul i32 [[RDX_2]], [[TMP0]]
; APPLE-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
; APPLE-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]]
; APPLE-NEXT: [[TMP1:%.*]] = load i32, ptr [[GEP_A_1]], align 2
; APPLE-NEXT: [[BIN_RDX3]] = add i32 [[RDX_1]], [[TMP1]]
; APPLE-NEXT: [[RDX_2_NEXT_1:%.*]] = mul i32 [[RDX_2_NEXT]], [[TMP1]]
; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
; APPLE-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]]
; APPLE-NEXT: [[TMP2:%.*]] = load i32, ptr [[GEP_A_2]], align 2
; APPLE-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_21]], [[TMP2]]
; APPLE-NEXT: [[RDX_2_NEXT_2:%.*]] = mul i32 [[RDX_2_NEXT_1]], [[TMP2]]
; APPLE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
; APPLE-NEXT: [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]]
; APPLE-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_3]], align 2
; APPLE-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_3]], [[TMP3]]
; APPLE-NEXT: [[RDX_2_NEXT_3]] = mul i32 [[RDX_2_NEXT_2]], [[TMP3]]
; APPLE-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
; APPLE-NEXT: [[EC_3:%.*]] = icmp eq i64 [[IV_NEXT_3]], 1024
; APPLE-NEXT: br i1 [[EC_3]], label %[[EXIT:.*]], label %[[LOOP]]
; APPLE: [[EXIT]]:
; APPLE-NEXT: [[BIN_RDX3:%.*]] = phi i32 [ [[RDX_NEXT]], %[[LOOP]] ]
; APPLE-NEXT: [[RES_2:%.*]] = phi i32 [ [[RDX_2_NEXT]], %[[LOOP]] ]
; APPLE-NEXT: [[RES_1:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
; APPLE-NEXT: [[RES_3:%.*]] = phi i32 [ [[RDX_2_NEXT_3]], %[[LOOP]] ]
; APPLE-NEXT: [[SUM:%.*]] = add i32 [[BIN_RDX3]], [[RES_2]]
; APPLE-NEXT: ret i32 [[SUM]]
; APPLE-NEXT: [[BIN_RDX2:%.*]] = add i32 [[RDX_NEXT_2]], [[SUM]]
; APPLE-NEXT: [[BIN_RDX4:%.*]] = add i32 [[RDX_NEXT_3]], [[BIN_RDX2]]
; APPLE-NEXT: [[SUM1:%.*]] = add i32 [[BIN_RDX4]], [[RES_3]]
; APPLE-NEXT: ret i32 [[SUM1]]
;
; OTHER-LABEL: define i32 @test_add_and_mul_reduction_unroll_partial(
; OTHER-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
@@ -992,18 +1031,72 @@ define i32 @test_add_reduction_runtime(ptr %a, i64 noundef %n) {
; APPLE-LABEL: define i32 @test_add_reduction_runtime(
; APPLE-SAME: ptr [[A:%.*]], i64 noundef [[N:%.*]]) #[[ATTR0]] {
; APPLE-NEXT: [[ENTRY:.*]]:
; APPLE-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 3
; APPLE-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 3
; APPLE-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
; APPLE: [[ENTRY_NEW]]:
; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
; APPLE-NEXT: br label %[[LOOP:.*]]
; APPLE: [[LOOP]]:
; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[RDX_EPIL:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_EPIL:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[IV_EPIL:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[RDX:%.*]] = phi i32 [ 0, %[[ENTRY_NEW]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_3:%.*]], %[[LOOP]] ]
; APPLE-NEXT: [[GEP_A_EPIL:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_EPIL]]
; APPLE-NEXT: [[TMP6:%.*]] = load i32, ptr [[GEP_A_EPIL]], align 2
; APPLE-NEXT: [[RDX_NEXT_EPIL]] = add nuw nsw i32 [[RDX_EPIL]], [[TMP6]]
; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL]], 1
; APPLE-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[TMP6]]
; APPLE-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV_EPIL]], 1
; APPLE-NEXT: [[GEP_A_1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT]]
; APPLE-NEXT: [[TMP3:%.*]] = load i32, ptr [[GEP_A_1]], align 2
; APPLE-NEXT: [[RDX_NEXT_1]] = add i32 [[RDX_1]], [[TMP3]]
; APPLE-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV_EPIL]], 2
; APPLE-NEXT: [[GEP_A_2:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_1]]
; APPLE-NEXT: [[TMP4:%.*]] = load i32, ptr [[GEP_A_2]], align 2
; APPLE-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_2]], [[TMP4]]
; APPLE-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV_EPIL]], 3
; APPLE-NEXT: [[GEP_A_3:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_NEXT_2]]
; APPLE-NEXT: [[TMP5:%.*]] = load i32, ptr [[GEP_A_3]], align 2
; APPLE-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_3]], [[TMP5]]
; APPLE-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV_EPIL]], 4
; APPLE-NEXT: [[NITER_NEXT_3]] = add nuw i64 [[NITER]], 4
; APPLE-NEXT: [[NITER_NCMP_3:%.*]] = icmp eq i64 [[NITER_NEXT_3]], [[UNROLL_ITER]]
; APPLE-NEXT: br i1 [[NITER_NCMP_3]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]]
; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
; APPLE-NEXT: [[RES_PH_PH:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_3]], %[[LOOP]] ]
; APPLE-NEXT: [[RDX_UNR_PH:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
; APPLE-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_1]], [[RDX_NEXT]]
; APPLE-NEXT: [[BIN_RDX2:%.*]] = add i32 [[RDX_NEXT_2]], [[BIN_RDX]]
; APPLE-NEXT: [[BIN_RDX3:%.*]] = add i32 [[RDX_NEXT_3]], [[BIN_RDX2]]
; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]]
; APPLE: [[EXIT_UNR_LCSSA]]:
; APPLE-NEXT: [[RES_PH:%.*]] = phi i32 [ poison, %[[ENTRY]] ], [ [[BIN_RDX3]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
; APPLE-NEXT: [[RDX_UNR:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[BIN_RDX3]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
; APPLE: [[LOOP_EPIL_PREHEADER]]:
; APPLE-NEXT: br label %[[LOOP_EPIL:.*]]
; APPLE: [[LOOP_EPIL]]:
; APPLE-NEXT: [[IV_EPIL1:%.*]] = phi i64 [ [[IV_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[IV_NEXT_EPIL:%.*]], %[[LOOP_EPIL]] ]
; APPLE-NEXT: [[RDX_EPIL:%.*]] = phi i32 [ [[RDX_UNR]], %[[LOOP_EPIL_PREHEADER]] ], [ [[RDX_NEXT_EPIL:%.*]], %[[LOOP_EPIL]] ]
; APPLE-NEXT: [[EPIL_ITER:%.*]] = phi i64 [ 0, %[[LOOP_EPIL_PREHEADER]] ], [ [[EPIL_ITER_NEXT:%.*]], %[[LOOP_EPIL]] ]
; APPLE-NEXT: [[GEP_A_EPIL1:%.*]] = getelementptr inbounds nuw i32, ptr [[A]], i64 [[IV_EPIL1]]
; APPLE-NEXT: [[TMP7:%.*]] = load i32, ptr [[GEP_A_EPIL1]], align 2
; APPLE-NEXT: [[RDX_NEXT_EPIL]] = add nuw nsw i32 [[RDX_EPIL]], [[TMP7]]
; APPLE-NEXT: [[IV_NEXT_EPIL]] = add nuw nsw i64 [[IV_EPIL1]], 1
; APPLE-NEXT: [[EC_EPIL:%.*]] = icmp eq i64 [[IV_NEXT_EPIL]], [[N]]
; APPLE-NEXT: br i1 [[EC_EPIL]], label %[[EXIT:.*]], label %[[LOOP]]
; APPLE-NEXT: [[EPIL_ITER_NEXT]] = add i64 [[EPIL_ITER]], 1
; APPLE-NEXT: [[EPIL_ITER_CMP:%.*]] = icmp ne i64 [[EPIL_ITER_NEXT]], [[XTRAITER]]
; APPLE-NEXT: br i1 [[EPIL_ITER_CMP]], label %[[LOOP_EPIL]], label %[[EXIT_EPILOG_LCSSA:.*]], !llvm.loop [[LOOP3:![0-9]+]]
; APPLE: [[EXIT_EPILOG_LCSSA]]:
; APPLE-NEXT: [[RES_PH1:%.*]] = phi i32 [ [[RDX_NEXT_EPIL]], %[[LOOP_EPIL]] ]
; APPLE-NEXT: br label %[[EXIT]]
; APPLE: [[EXIT]]:
; APPLE-NEXT: [[RES:%.*]] = phi i32 [ [[RDX_NEXT_EPIL]], %[[LOOP]] ]
; APPLE-NEXT: [[RES:%.*]] = phi i32 [ [[RES_PH]], %[[EXIT_UNR_LCSSA]] ], [ [[RES_PH1]], %[[EXIT_EPILOG_LCSSA]] ]
; APPLE-NEXT: ret i32 [[RES]]
;
; OTHER-LABEL: define i32 @test_add_reduction_runtime(
@@ -1092,6 +1185,7 @@ exit:
; APPLE: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; APPLE: [[META1]] = !{!"llvm.loop.unroll.disable"}
; APPLE: [[LOOP2]] = distinct !{[[LOOP2]], [[META1]]}
; APPLE: [[LOOP3]] = distinct !{[[LOOP3]], [[META1]]}
;.
; OTHER: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]}
; OTHER: [[META1]] = !{!"llvm.loop.unroll.disable"}