Skip to content

Commit 2d9e452

Browse files
authored
[LoopUnroll] Introduce parallel reduction phis when unrolling. (#149470)
When partially or runtime unrolling loops with reductions, currently the reductions are performed in-order in the loop, negating most benefits from unrolling such loops. This patch extends unrolling code-gen to keep a parallel reduction phi per unrolled iteration and combine the final result after the loop. For out-of-order CPUs, this allows executing multiple reduction chains in parallel. For now, the initial transformation is restricted to cases where we unroll a small number of iterations (hard-coded to 4, but should maybe be capped by TTI depending on the execution units), to avoid introducing an excessive amount of parallel phis. It also requires single block loops for now, where the unrolled iterations are known to not exit the loop (either due to runtime unrolling or partial unrolling). This ensures that the unrolled loop will have a single basic block, with a single exit block where we can place the final reduction value computation. The initial implementation also only supports parallelizing loops with a single reduction and only integer reductions. Those restrictions are just to keep the initial implementation simpler, and can easily be lifted as follow-ups. With corresponding TTI to the AArch64 unrolling preferences which I will also share soon, this triggers in ~300 loops across a wide range of workloads, including LLVM itself, ffmpeg, av1aom, sqlite, blender, brotli, zstd and more. PR: #149470
1 parent 13875dc commit 2d9e452

File tree

4 files changed

+196
-31
lines changed

4 files changed

+196
-31
lines changed

llvm/include/llvm/Transforms/Utils/UnrollLoop.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,9 @@ LLVM_ABI bool computeUnrollCount(
163163
TargetTransformInfo::UnrollingPreferences &UP,
164164
TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound);
165165

166+
LLVM_ABI std::optional<RecurrenceDescriptor>
167+
canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L,
168+
ScalarEvolution *SE);
166169
} // end namespace llvm
167170

168171
#endif // LLVM_TRANSFORMS_UTILS_UNROLLLOOP_H

llvm/lib/Transforms/Utils/LoopUnroll.cpp

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
#include "llvm/IR/DiagnosticInfo.h"
4242
#include "llvm/IR/Dominators.h"
4343
#include "llvm/IR/Function.h"
44+
#include "llvm/IR/IRBuilder.h"
4445
#include "llvm/IR/Instruction.h"
4546
#include "llvm/IR/Instructions.h"
4647
#include "llvm/IR/IntrinsicInst.h"
@@ -108,6 +109,9 @@ UnrollVerifyLoopInfo("unroll-verify-loopinfo", cl::Hidden,
108109
#endif
109110
);
110111

112+
static cl::opt<bool> UnrollAddParallelReductions(
113+
"unroll-add-parallel-reductions", cl::init(false), cl::Hidden,
114+
cl::desc("Allow unrolling to add parallel reduction phis."));
111115

112116
/// Check if unrolling created a situation where we need to insert phi nodes to
113117
/// preserve LCSSA form.
@@ -660,6 +664,39 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
660664
OrigPHINode.push_back(cast<PHINode>(I));
661665
}
662666

667+
// Collect phi nodes for reductions for which we can introduce multiple
668+
// parallel reduction phis and compute the final reduction result after the
669+
// loop. This requires a single exit block after unrolling. This is ensured by
670+
// restricting to single-block loops where the unrolled iterations are known
671+
// to not exit.
672+
DenseMap<PHINode *, RecurrenceDescriptor> Reductions;
673+
bool CanAddAdditionalAccumulators =
674+
UnrollAddParallelReductions && !CompletelyUnroll &&
675+
L->getNumBlocks() == 1 &&
676+
(ULO.Runtime ||
677+
(ExitInfos.contains(Header) && ((ExitInfos[Header].TripCount != 0 &&
678+
ExitInfos[Header].BreakoutTrip == 0))));
679+
680+
// Limit parallelizing reductions to unroll counts of 4 or less for now.
681+
// TODO: The number of parallel reductions should depend on the number of
682+
// execution units. We also don't have to add a parallel reduction phi per
683+
// unrolled iteration, but could for example add a parallel phi for every 2
684+
// unrolled iterations.
685+
if (CanAddAdditionalAccumulators && ULO.Count <= 4) {
686+
for (PHINode &Phi : Header->phis()) {
687+
auto RdxDesc = canParallelizeReductionWhenUnrolling(Phi, L, SE);
688+
if (!RdxDesc)
689+
continue;
690+
691+
// Only handle duplicate phis for a single reduction for now.
692+
// TODO: Handle any number of reductions
693+
if (!Reductions.empty())
694+
continue;
695+
696+
Reductions[&Phi] = *RdxDesc;
697+
}
698+
}
699+
663700
std::vector<BasicBlock *> Headers;
664701
std::vector<BasicBlock *> Latches;
665702
Headers.push_back(Header);
@@ -710,6 +747,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
710747
// latch. This is a reasonable default placement if we don't have block
711748
// frequencies, and if we do, well the layout will be adjusted later.
712749
auto BlockInsertPt = std::next(LatchBlock->getIterator());
750+
SmallVector<Instruction *> PartialReductions;
713751
for (unsigned It = 1; It != ULO.Count; ++It) {
714752
SmallVector<BasicBlock *, 8> NewBlocks;
715753
SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
@@ -733,6 +771,31 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
733771
for (PHINode *OrigPHI : OrigPHINode) {
734772
PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);
735773
Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
774+
775+
// Use cloned phis as parallel phis for partial reductions, which will
776+
// get combined to the final reduction result after the loop.
777+
if (Reductions.contains(OrigPHI)) {
778+
// Collect partial reduction results.
779+
if (PartialReductions.empty())
780+
PartialReductions.push_back(cast<Instruction>(InVal));
781+
PartialReductions.push_back(cast<Instruction>(VMap[InVal]));
782+
783+
// Update the start value for the cloned phis to use the identity
784+
// value for the reduction.
785+
const RecurrenceDescriptor &RdxDesc = Reductions[OrigPHI];
786+
NewPHI->setIncomingValueForBlock(
787+
L->getLoopPreheader(),
788+
getRecurrenceIdentity(RdxDesc.getRecurrenceKind(),
789+
OrigPHI->getType(),
790+
RdxDesc.getFastMathFlags()));
791+
792+
// Update NewPHI to use the cloned value for the iteration and move
793+
// to header.
794+
NewPHI->replaceUsesOfWith(InVal, VMap[InVal]);
795+
NewPHI->moveBefore(OrigPHI->getIterator());
796+
continue;
797+
}
798+
736799
if (Instruction *InValI = dyn_cast<Instruction>(InVal))
737800
if (It > 1 && L->contains(InValI))
738801
InVal = LastValueMap[InValI];
@@ -832,6 +895,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
832895
PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
833896
PN->eraseFromParent();
834897
} else if (ULO.Count > 1) {
898+
if (Reductions.contains(PN))
899+
continue;
900+
835901
Value *InVal = PN->removeIncomingValue(LatchBlock, false);
836902
// If this value was defined in the loop, take the value defined by the
837903
// last iteration of the loop.
@@ -1010,6 +1076,38 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
10101076
}
10111077
}
10121078

1079+
// If there are partial reductions, create code in the exit block to compute
1080+
// the final result and update users of the final result.
1081+
if (!PartialReductions.empty()) {
1082+
BasicBlock *ExitBlock = L->getExitBlock();
1083+
assert(ExitBlock &&
1084+
"Can only introduce parallel reduction phis with single exit block");
1085+
assert(Reductions.size() == 1 &&
1086+
"currently only a single reduction is supported");
1087+
Value *FinalRdxValue = PartialReductions.back();
1088+
Value *RdxResult = nullptr;
1089+
for (PHINode &Phi : ExitBlock->phis()) {
1090+
if (Phi.getIncomingValueForBlock(L->getLoopLatch()) != FinalRdxValue)
1091+
continue;
1092+
if (!RdxResult) {
1093+
RdxResult = PartialReductions.front();
1094+
IRBuilder Builder(ExitBlock, ExitBlock->getFirstNonPHIIt());
1095+
RecurKind RK = Reductions.begin()->second.getRecurrenceKind();
1096+
for (Instruction *RdxPart : drop_begin(PartialReductions)) {
1097+
RdxResult = Builder.CreateBinOp(
1098+
(Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK),
1099+
RdxPart, RdxResult, "bin.rdx");
1100+
}
1101+
NeedToFixLCSSA = true;
1102+
for (Instruction *RdxPart : PartialReductions)
1103+
RdxPart->dropPoisonGeneratingFlags();
1104+
}
1105+
1106+
Phi.replaceAllUsesWith(RdxResult);
1107+
continue;
1108+
}
1109+
}
1110+
10131111
if (DTUToUse) {
10141112
// Apply updates to the DomTree.
10151113
DT = &DTU.getDomTree();
@@ -1111,3 +1209,41 @@ MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
11111209
}
11121210
return nullptr;
11131211
}
1212+
1213+
std::optional<RecurrenceDescriptor>
1214+
llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L,
1215+
ScalarEvolution *SE) {
1216+
RecurrenceDescriptor RdxDesc;
1217+
if (!RecurrenceDescriptor::isReductionPHI(&Phi, L, RdxDesc,
1218+
/*DemandedBits=*/nullptr,
1219+
/*AC=*/nullptr, /*DT=*/nullptr, SE))
1220+
return std::nullopt;
1221+
RecurKind RK = RdxDesc.getRecurrenceKind();
1222+
// Skip unsupported reductions.
1223+
// TODO: Handle additional reductions, including FP and min-max
1224+
// reductions.
1225+
if (!RecurrenceDescriptor::isIntegerRecurrenceKind(RK) ||
1226+
RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
1227+
RecurrenceDescriptor::isFindIVRecurrenceKind(RK) ||
1228+
RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))
1229+
return std::nullopt;
1230+
1231+
if (RdxDesc.IntermediateStore)
1232+
return std::nullopt;
1233+
1234+
// Don't unroll reductions with constant ops; those can be folded to a
1235+
// single induction update.
1236+
if (any_of(cast<Instruction>(Phi.getIncomingValueForBlock(L->getLoopLatch()))
1237+
->operands(),
1238+
IsaPred<Constant>))
1239+
return std::nullopt;
1240+
1241+
BasicBlock *Latch = L->getLoopLatch();
1242+
if (!Latch ||
1243+
!is_contained(
1244+
cast<Instruction>(Phi.getIncomingValueForBlock(Latch))->operands(),
1245+
&Phi))
1246+
return std::nullopt;
1247+
1248+
return RdxDesc;
1249+
}

llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll

Lines changed: 49 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2-
; RUN: opt -p loop-unroll -unroll-allow-partial -unroll-max-count=4 -S %s | FileCheck %s
2+
; RUN: opt -p loop-unroll -unroll-add-parallel-reductions -unroll-allow-partial -unroll-max-count=4 -S %s | FileCheck %s
33

44
define i32 @test_add(ptr %src, i64 %n, i32 %start) {
55
; CHECK-LABEL: define i32 @test_add(
@@ -8,27 +8,33 @@ define i32 @test_add(ptr %src, i64 %n, i32 %start) {
88
; CHECK-NEXT: br label %[[LOOP:.*]]
99
; CHECK: [[LOOP]]:
1010
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
11-
; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
11+
; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
12+
; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ]
13+
; CHECK-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_24:%.*]], %[[LOOP]] ]
14+
; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
1215
; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
1316
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]]
1417
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1
15-
; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]]
18+
; CHECK-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[L]]
1619
; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
1720
; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]]
1821
; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1
19-
; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = add i32 [[RDX_NEXT]], [[L_1]]
22+
; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_1]], [[L_1]]
2023
; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
2124
; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]]
2225
; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1
23-
; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_NEXT_1]], [[L_2]]
26+
; CHECK-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_NEXT_1]], [[L_2]]
2427
; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
2528
; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]]
2629
; CHECK-NEXT: [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1
27-
; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_24]]
30+
; CHECK-NEXT: [[RDX_NEXT_24]] = add i32 [[RDX_3]], [[L_24]]
2831
; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
2932
; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
3033
; CHECK: [[EXIT]]:
31-
; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
34+
; CHECK-NEXT: [[RDX_NEXT_LCSSA1:%.*]] = phi i32 [ [[RDX_NEXT_24]], %[[LOOP]] ]
35+
; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_3]], [[RDX_NEXT]]
36+
; CHECK-NEXT: [[BIN_RDX1:%.*]] = add i32 [[RDX_NEXT_2]], [[BIN_RDX]]
37+
; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = add i32 [[RDX_NEXT_24]], [[BIN_RDX1]]
3238
; CHECK-NEXT: ret i32 [[RDX_NEXT_LCSSA]]
3339
;
3440
entry:
@@ -203,33 +209,39 @@ define i32 @test_add_and_mul_reduction(ptr %src, i64 %n, i32 %start) {
203209
; CHECK-NEXT: br label %[[LOOP:.*]]
204210
; CHECK: [[LOOP]]:
205211
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
206-
; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_1_NEXT_3:%.*]], %[[LOOP]] ]
212+
; CHECK-NEXT: [[RDX_1_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_1_NEXT_1:%.*]], %[[LOOP]] ]
213+
; CHECK-NEXT: [[RDX_1_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_1_NEXT_2:%.*]], %[[LOOP]] ]
214+
; CHECK-NEXT: [[RDX_1_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_1_NEXT_24:%.*]], %[[LOOP]] ]
215+
; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_1_NEXT:%.*]], %[[LOOP]] ]
207216
; CHECK-NEXT: [[RDX_2:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_2_NEXT_3:%.*]], %[[LOOP]] ]
208217
; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
209218
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]]
210219
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1
211-
; CHECK-NEXT: [[RDX_1_NEXT:%.*]] = add i32 [[RDX_1]], [[L]]
220+
; CHECK-NEXT: [[RDX_1_NEXT]] = add i32 [[RDX_1]], [[L]]
212221
; CHECK-NEXT: [[RDX_2_NEXT:%.*]] = mul i32 [[RDX_2]], [[L]]
213222
; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
214223
; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]]
215224
; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1
216-
; CHECK-NEXT: [[RDX_1_2:%.*]] = add i32 [[RDX_1_NEXT]], [[L_1]]
225+
; CHECK-NEXT: [[RDX_1_NEXT_1]] = add i32 [[RDX_1_1]], [[L_1]]
217226
; CHECK-NEXT: [[RDX_2_2:%.*]] = mul i32 [[RDX_2_NEXT]], [[L_1]]
218227
; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
219228
; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]]
220229
; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1
221-
; CHECK-NEXT: [[RDX_1_NEXT_2:%.*]] = add i32 [[RDX_1_2]], [[L_2]]
230+
; CHECK-NEXT: [[RDX_1_NEXT_2]] = add i32 [[RDX_1_2]], [[L_2]]
222231
; CHECK-NEXT: [[RDX_2_NEXT_2:%.*]] = mul i32 [[RDX_2_2]], [[L_2]]
223232
; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
224233
; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]]
225234
; CHECK-NEXT: [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1
226-
; CHECK-NEXT: [[RDX_1_NEXT_3]] = add i32 [[RDX_1_NEXT_2]], [[L_24]]
235+
; CHECK-NEXT: [[RDX_1_NEXT_24]] = add i32 [[RDX_1_3]], [[L_24]]
227236
; CHECK-NEXT: [[RDX_2_NEXT_3]] = mul i32 [[RDX_2_NEXT_2]], [[L_24]]
228237
; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
229238
; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
230239
; CHECK: [[EXIT]]:
231-
; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_1_NEXT_3]], %[[LOOP]] ]
240+
; CHECK-NEXT: [[RDX_1_NEXT_LCSSA1:%.*]] = phi i32 [ [[RDX_1_NEXT_24]], %[[LOOP]] ]
232241
; CHECK-NEXT: [[BIN_RDX5:%.*]] = phi i32 [ [[RDX_2_NEXT_3]], %[[LOOP]] ]
242+
; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_1_NEXT_1]], [[RDX_1_NEXT]]
243+
; CHECK-NEXT: [[BIN_RDX1:%.*]] = add i32 [[RDX_1_NEXT_2]], [[BIN_RDX]]
244+
; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = add i32 [[RDX_1_NEXT_24]], [[BIN_RDX1]]
233245
; CHECK-NEXT: [[RES:%.*]] = add i32 [[RDX_1_NEXT_LCSSA]], [[BIN_RDX5]]
234246
; CHECK-NEXT: ret i32 [[RES]]
235247
;
@@ -509,20 +521,26 @@ define i32 @test_add_with_call(i64 %n, i32 %start) {
509521
; CHECK-NEXT: br label %[[LOOP:.*]]
510522
; CHECK: [[LOOP]]:
511523
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
512-
; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
524+
; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
525+
; CHECK-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ]
526+
; CHECK-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
527+
; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
513528
; CHECK-NEXT: [[L:%.*]] = call i32 @foo()
514-
; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]]
529+
; CHECK-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[L]]
515530
; CHECK-NEXT: [[L_1:%.*]] = call i32 @foo()
516-
; CHECK-NEXT: [[RDX_2:%.*]] = add i32 [[RDX_NEXT]], [[L_1]]
531+
; CHECK-NEXT: [[RDX_NEXT_1]] = add i32 [[RDX_1]], [[L_1]]
517532
; CHECK-NEXT: [[L_2:%.*]] = call i32 @foo()
518-
; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_2]], [[L_2]]
533+
; CHECK-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_2]], [[L_2]]
519534
; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
520535
; CHECK-NEXT: [[L_3:%.*]] = call i32 @foo()
521-
; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_3]]
536+
; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_3]], [[L_3]]
522537
; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
523538
; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
524539
; CHECK: [[EXIT]]:
525-
; CHECK-NEXT: [[BIN_RDX2:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
540+
; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
541+
; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_1]], [[RDX_NEXT]]
542+
; CHECK-NEXT: [[BIN_RDX1:%.*]] = add i32 [[RDX_NEXT_2]], [[BIN_RDX]]
543+
; CHECK-NEXT: [[BIN_RDX2:%.*]] = add i32 [[RDX_NEXT_3]], [[BIN_RDX1]]
526544
; CHECK-NEXT: ret i32 [[BIN_RDX2]]
527545
;
528546
entry:
@@ -550,35 +568,41 @@ define i32 @test_add_with_backward_dep(ptr %p, i64 %n, i32 %start) {
550568
; CHECK-NEXT: br label %[[LOOP:.*]]
551569
; CHECK: [[LOOP]]:
552570
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
553-
; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
571+
; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
572+
; CHECK-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ]
573+
; CHECK-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
574+
; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
554575
; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
555576
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV]]
556577
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP]], align 4
557578
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT]]
558579
; CHECK-NEXT: store i32 0, ptr [[GEP_1]], align 4
559-
; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]]
580+
; CHECK-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[L]]
560581
; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
561582
; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT]]
562583
; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_11]], align 4
563584
; CHECK-NEXT: [[GEP_1_1:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_1]]
564585
; CHECK-NEXT: store i32 0, ptr [[GEP_1_1]], align 4
565-
; CHECK-NEXT: [[RDX_2:%.*]] = add i32 [[RDX_NEXT]], [[L_1]]
586+
; CHECK-NEXT: [[RDX_NEXT_1]] = add i32 [[RDX_1]], [[L_1]]
566587
; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
567588
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_1]]
568589
; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_2]], align 4
569590
; CHECK-NEXT: [[GEP_1_2:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_2]]
570591
; CHECK-NEXT: store i32 0, ptr [[GEP_1_2]], align 4
571-
; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_2]], [[L_2]]
592+
; CHECK-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_2]], [[L_2]]
572593
; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
573594
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_2]]
574595
; CHECK-NEXT: [[L_3:%.*]] = load i32, ptr [[GEP_3]], align 4
575596
; CHECK-NEXT: [[GEP_1_3:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_3]]
576597
; CHECK-NEXT: store i32 0, ptr [[GEP_1_3]], align 4
577-
; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_3]]
598+
; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_3]], [[L_3]]
578599
; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
579600
; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
580601
; CHECK: [[EXIT]]:
581-
; CHECK-NEXT: [[BIN_RDX3:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
602+
; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
603+
; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_1]], [[RDX_NEXT]]
604+
; CHECK-NEXT: [[BIN_RDX2:%.*]] = add i32 [[RDX_NEXT_2]], [[BIN_RDX]]
605+
; CHECK-NEXT: [[BIN_RDX3:%.*]] = add i32 [[RDX_NEXT_3]], [[BIN_RDX2]]
582606
; CHECK-NEXT: ret i32 [[BIN_RDX3]]
583607
;
584608
entry:

0 commit comments

Comments
 (0)