Skip to content

Commit 66fde79

Browse files
author
git apple-llvm automerger
committed
Merge commit '2d9e452ab0f3' from llvm.org/main into next
2 parents f9a356c + 2d9e452 commit 66fde79

File tree

4 files changed

+196
-31
lines changed

4 files changed

+196
-31
lines changed

llvm/include/llvm/Transforms/Utils/UnrollLoop.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,9 @@ LLVM_ABI bool computeUnrollCount(
163163
TargetTransformInfo::UnrollingPreferences &UP,
164164
TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound);
165165

166+
/// Check whether \p Phi is a reduction in loop \p L for which the unroller can
/// introduce parallel reduction phis. Returns the reduction's
/// RecurrenceDescriptor if so, and std::nullopt otherwise. \p SE is forwarded
/// to the reduction analysis.
LLVM_ABI std::optional<RecurrenceDescriptor>
167+
canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L,
168+
ScalarEvolution *SE);
166169
} // end namespace llvm
167170

168171
#endif // LLVM_TRANSFORMS_UTILS_UNROLLLOOP_H

llvm/lib/Transforms/Utils/LoopUnroll.cpp

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@
4141
#include "llvm/IR/DiagnosticInfo.h"
4242
#include "llvm/IR/Dominators.h"
4343
#include "llvm/IR/Function.h"
44+
#include "llvm/IR/IRBuilder.h"
4445
#include "llvm/IR/Instruction.h"
4546
#include "llvm/IR/Instructions.h"
4647
#include "llvm/IR/IntrinsicInst.h"
@@ -108,6 +109,9 @@ UnrollVerifyLoopInfo("unroll-verify-loopinfo", cl::Hidden,
108109
#endif
109110
);
110111

112+
// Hidden, off-by-default flag gating the introduction of parallel reduction
// phis during unrolling.
static cl::opt<bool> UnrollAddParallelReductions(
113+
"unroll-add-parallel-reductions", cl::init(false), cl::Hidden,
114+
cl::desc("Allow unrolling to add parallel reduction phis."));
111115

112116
/// Check if unrolling created a situation where we need to insert phi nodes to
113117
/// preserve LCSSA form.
@@ -660,6 +664,39 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
660664
OrigPHINode.push_back(cast<PHINode>(I));
661665
}
662666

667+
// Collect phi nodes for reductions for which we can introduce multiple
668+
// parallel reduction phis and compute the final reduction result after the
669+
// loop. This requires a single exit block after unrolling. This is ensured by
670+
// restricting to single-block loops where the unrolled iterations are known
671+
// to not exit.
672+
DenseMap<PHINode *, RecurrenceDescriptor> Reductions;
673+
bool CanAddAdditionalAccumulators =
674+
UnrollAddParallelReductions && !CompletelyUnroll &&
675+
L->getNumBlocks() == 1 &&
676+
(ULO.Runtime ||
677+
(ExitInfos.contains(Header) && ((ExitInfos[Header].TripCount != 0 &&
678+
ExitInfos[Header].BreakoutTrip == 0))));
679+
680+
// Limit parallelizing reductions to unroll counts of 4 or less for now.
681+
// TODO: The number of parallel reductions should depend on the number of
682+
// execution units. We also don't have to add a parallel reduction phi per
683+
// unrolled iteration, but could for example add a parallel phi for every 2
684+
// unrolled iterations.
685+
if (CanAddAdditionalAccumulators && ULO.Count <= 4) {
686+
for (PHINode &Phi : Header->phis()) {
687+
auto RdxDesc = canParallelizeReductionWhenUnrolling(Phi, L, SE);
688+
if (!RdxDesc)
689+
continue;
690+
691+
// Only handle duplicate phis for a single reduction for now.
692+
// TODO: Handle any number of reductions
693+
if (!Reductions.empty())
694+
continue;
695+
696+
Reductions[&Phi] = *RdxDesc;
697+
}
698+
}
699+
663700
std::vector<BasicBlock *> Headers;
664701
std::vector<BasicBlock *> Latches;
665702
Headers.push_back(Header);
@@ -710,6 +747,7 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
710747
// latch. This is a reasonable default placement if we don't have block
711748
// frequencies, and if we do, well the layout will be adjusted later.
712749
auto BlockInsertPt = std::next(LatchBlock->getIterator());
750+
SmallVector<Instruction *> PartialReductions;
713751
for (unsigned It = 1; It != ULO.Count; ++It) {
714752
SmallVector<BasicBlock *, 8> NewBlocks;
715753
SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
@@ -733,6 +771,31 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
733771
for (PHINode *OrigPHI : OrigPHINode) {
734772
PHINode *NewPHI = cast<PHINode>(VMap[OrigPHI]);
735773
Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
774+
775+
// Use cloned phis as parallel phis for partial reductions, which will
776+
// get combined to the final reduction result after the loop.
777+
if (Reductions.contains(OrigPHI)) {
778+
// Collect partial reduction results.
779+
if (PartialReductions.empty())
780+
PartialReductions.push_back(cast<Instruction>(InVal));
781+
PartialReductions.push_back(cast<Instruction>(VMap[InVal]));
782+
783+
// Update the start value for the cloned phis to use the identity
784+
// value for the reduction.
785+
const RecurrenceDescriptor &RdxDesc = Reductions[OrigPHI];
786+
NewPHI->setIncomingValueForBlock(
787+
L->getLoopPreheader(),
788+
getRecurrenceIdentity(RdxDesc.getRecurrenceKind(),
789+
OrigPHI->getType(),
790+
RdxDesc.getFastMathFlags()));
791+
792+
// Update NewPHI to use the cloned value for the iteration and move
793+
// to header.
794+
NewPHI->replaceUsesOfWith(InVal, VMap[InVal]);
795+
NewPHI->moveBefore(OrigPHI->getIterator());
796+
continue;
797+
}
798+
736799
if (Instruction *InValI = dyn_cast<Instruction>(InVal))
737800
if (It > 1 && L->contains(InValI))
738801
InVal = LastValueMap[InValI];
@@ -832,6 +895,9 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
832895
PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
833896
PN->eraseFromParent();
834897
} else if (ULO.Count > 1) {
898+
if (Reductions.contains(PN))
899+
continue;
900+
835901
Value *InVal = PN->removeIncomingValue(LatchBlock, false);
836902
// If this value was defined in the loop, take the value defined by the
837903
// last iteration of the loop.
@@ -1010,6 +1076,38 @@ llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
10101076
}
10111077
}
10121078

1079+
// If there are partial reductions, create code in the exit block to compute
1080+
// the final result and update users of the final result.
1081+
if (!PartialReductions.empty()) {
1082+
BasicBlock *ExitBlock = L->getExitBlock();
1083+
assert(ExitBlock &&
1084+
"Can only introduce parallel reduction phis with single exit block");
1085+
assert(Reductions.size() == 1 &&
1086+
"currently only a single reduction is supported");
1087+
Value *FinalRdxValue = PartialReductions.back();
1088+
Value *RdxResult = nullptr;
1089+
for (PHINode &Phi : ExitBlock->phis()) {
1090+
if (Phi.getIncomingValueForBlock(L->getLoopLatch()) != FinalRdxValue)
1091+
continue;
1092+
if (!RdxResult) {
1093+
RdxResult = PartialReductions.front();
1094+
IRBuilder Builder(ExitBlock, ExitBlock->getFirstNonPHIIt());
1095+
RecurKind RK = Reductions.begin()->second.getRecurrenceKind();
1096+
for (Instruction *RdxPart : drop_begin(PartialReductions)) {
1097+
RdxResult = Builder.CreateBinOp(
1098+
(Instruction::BinaryOps)RecurrenceDescriptor::getOpcode(RK),
1099+
RdxPart, RdxResult, "bin.rdx");
1100+
}
1101+
NeedToFixLCSSA = true;
1102+
for (Instruction *RdxPart : PartialReductions)
1103+
RdxPart->dropPoisonGeneratingFlags();
1104+
}
1105+
1106+
Phi.replaceAllUsesWith(RdxResult);
1107+
continue;
1108+
}
1109+
}
1110+
10131111
if (DTUToUse) {
10141112
// Apply updates to the DomTree.
10151113
DT = &DTU.getDomTree();
@@ -1111,3 +1209,41 @@ MDNode *llvm::GetUnrollMetadata(MDNode *LoopID, StringRef Name) {
11111209
}
11121210
return nullptr;
11131211
}
1212+
1213+
std::optional<RecurrenceDescriptor>
1214+
llvm::canParallelizeReductionWhenUnrolling(PHINode &Phi, Loop *L,
1215+
ScalarEvolution *SE) {
1216+
RecurrenceDescriptor RdxDesc;
1217+
if (!RecurrenceDescriptor::isReductionPHI(&Phi, L, RdxDesc,
1218+
/*DemandedBits=*/nullptr,
1219+
/*AC=*/nullptr, /*DT=*/nullptr, SE))
1220+
return std::nullopt;
1221+
RecurKind RK = RdxDesc.getRecurrenceKind();
1222+
// Skip unsupported reductions.
1223+
// TODO: Handle additional reductions, including FP and min-max
1224+
// reductions.
1225+
if (!RecurrenceDescriptor::isIntegerRecurrenceKind(RK) ||
1226+
RecurrenceDescriptor::isAnyOfRecurrenceKind(RK) ||
1227+
RecurrenceDescriptor::isFindIVRecurrenceKind(RK) ||
1228+
RecurrenceDescriptor::isMinMaxRecurrenceKind(RK))
1229+
return std::nullopt;
1230+
1231+
if (RdxDesc.IntermediateStore)
1232+
return std::nullopt;
1233+
1234+
// Don't unroll reductions with constant ops; those can be folded to a
1235+
// single induction update.
1236+
if (any_of(cast<Instruction>(Phi.getIncomingValueForBlock(L->getLoopLatch()))
1237+
->operands(),
1238+
IsaPred<Constant>))
1239+
return std::nullopt;
1240+
1241+
BasicBlock *Latch = L->getLoopLatch();
1242+
if (!Latch ||
1243+
!is_contained(
1244+
cast<Instruction>(Phi.getIncomingValueForBlock(Latch))->operands(),
1245+
&Phi))
1246+
return std::nullopt;
1247+
1248+
return RdxDesc;
1249+
}

llvm/test/Transforms/LoopUnroll/partial-unroll-reductions.ll

Lines changed: 49 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2-
; RUN: opt -p loop-unroll -unroll-allow-partial -unroll-max-count=4 -S %s | FileCheck %s
2+
; RUN: opt -p loop-unroll -unroll-add-parallel-reductions -unroll-allow-partial -unroll-max-count=4 -S %s | FileCheck %s
33

44
define i32 @test_add(ptr %src, i64 %n, i32 %start) {
55
; CHECK-LABEL: define i32 @test_add(
@@ -8,27 +8,33 @@ define i32 @test_add(ptr %src, i64 %n, i32 %start) {
88
; CHECK-NEXT: br label %[[LOOP:.*]]
99
; CHECK: [[LOOP]]:
1010
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
11-
; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
11+
; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
12+
; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ]
13+
; CHECK-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_24:%.*]], %[[LOOP]] ]
14+
; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
1215
; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
1316
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]]
1417
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1
15-
; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]]
18+
; CHECK-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[L]]
1619
; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
1720
; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]]
1821
; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1
19-
; CHECK-NEXT: [[RDX_NEXT_1:%.*]] = add i32 [[RDX_NEXT]], [[L_1]]
22+
; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_1]], [[L_1]]
2023
; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
2124
; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]]
2225
; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1
23-
; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_NEXT_1]], [[L_2]]
26+
; CHECK-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_NEXT_1]], [[L_2]]
2427
; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
2528
; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]]
2629
; CHECK-NEXT: [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1
27-
; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_24]]
30+
; CHECK-NEXT: [[RDX_NEXT_24]] = add i32 [[RDX_3]], [[L_24]]
2831
; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
2932
; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
3033
; CHECK: [[EXIT]]:
31-
; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
34+
; CHECK-NEXT: [[RDX_NEXT_LCSSA1:%.*]] = phi i32 [ [[RDX_NEXT_24]], %[[LOOP]] ]
35+
; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_3]], [[RDX_NEXT]]
36+
; CHECK-NEXT: [[BIN_RDX1:%.*]] = add i32 [[RDX_NEXT_2]], [[BIN_RDX]]
37+
; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = add i32 [[RDX_NEXT_24]], [[BIN_RDX1]]
3238
; CHECK-NEXT: ret i32 [[RDX_NEXT_LCSSA]]
3339
;
3440
entry:
@@ -203,33 +209,39 @@ define i32 @test_add_and_mul_reduction(ptr %src, i64 %n, i32 %start) {
203209
; CHECK-NEXT: br label %[[LOOP:.*]]
204210
; CHECK: [[LOOP]]:
205211
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
206-
; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_1_NEXT_3:%.*]], %[[LOOP]] ]
212+
; CHECK-NEXT: [[RDX_1_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_1_NEXT_1:%.*]], %[[LOOP]] ]
213+
; CHECK-NEXT: [[RDX_1_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_1_NEXT_2:%.*]], %[[LOOP]] ]
214+
; CHECK-NEXT: [[RDX_1_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_1_NEXT_24:%.*]], %[[LOOP]] ]
215+
; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_1_NEXT:%.*]], %[[LOOP]] ]
207216
; CHECK-NEXT: [[RDX_2:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_2_NEXT_3:%.*]], %[[LOOP]] ]
208217
; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
209218
; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV]]
210219
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 1
211-
; CHECK-NEXT: [[RDX_1_NEXT:%.*]] = add i32 [[RDX_1]], [[L]]
220+
; CHECK-NEXT: [[RDX_1_NEXT]] = add i32 [[RDX_1]], [[L]]
212221
; CHECK-NEXT: [[RDX_2_NEXT:%.*]] = mul i32 [[RDX_2]], [[L]]
213222
; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
214223
; CHECK-NEXT: [[GEP_SRC_1:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT]]
215224
; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_SRC_1]], align 1
216-
; CHECK-NEXT: [[RDX_1_2:%.*]] = add i32 [[RDX_1_NEXT]], [[L_1]]
225+
; CHECK-NEXT: [[RDX_1_NEXT_1]] = add i32 [[RDX_1_1]], [[L_1]]
217226
; CHECK-NEXT: [[RDX_2_2:%.*]] = mul i32 [[RDX_2_NEXT]], [[L_1]]
218227
; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
219228
; CHECK-NEXT: [[GEP_SRC_2:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_1]]
220229
; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_SRC_2]], align 1
221-
; CHECK-NEXT: [[RDX_1_NEXT_2:%.*]] = add i32 [[RDX_1_2]], [[L_2]]
230+
; CHECK-NEXT: [[RDX_1_NEXT_2]] = add i32 [[RDX_1_2]], [[L_2]]
222231
; CHECK-NEXT: [[RDX_2_NEXT_2:%.*]] = mul i32 [[RDX_2_2]], [[L_2]]
223232
; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
224233
; CHECK-NEXT: [[GEP_SRC_24:%.*]] = getelementptr i32, ptr [[SRC]], i64 [[IV_NEXT_2]]
225234
; CHECK-NEXT: [[L_24:%.*]] = load i32, ptr [[GEP_SRC_24]], align 1
226-
; CHECK-NEXT: [[RDX_1_NEXT_3]] = add i32 [[RDX_1_NEXT_2]], [[L_24]]
235+
; CHECK-NEXT: [[RDX_1_NEXT_24]] = add i32 [[RDX_1_3]], [[L_24]]
227236
; CHECK-NEXT: [[RDX_2_NEXT_3]] = mul i32 [[RDX_2_NEXT_2]], [[L_24]]
228237
; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
229238
; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
230239
; CHECK: [[EXIT]]:
231-
; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_1_NEXT_3]], %[[LOOP]] ]
240+
; CHECK-NEXT: [[RDX_1_NEXT_LCSSA1:%.*]] = phi i32 [ [[RDX_1_NEXT_24]], %[[LOOP]] ]
232241
; CHECK-NEXT: [[BIN_RDX5:%.*]] = phi i32 [ [[RDX_2_NEXT_3]], %[[LOOP]] ]
242+
; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_1_NEXT_1]], [[RDX_1_NEXT]]
243+
; CHECK-NEXT: [[BIN_RDX1:%.*]] = add i32 [[RDX_1_NEXT_2]], [[BIN_RDX]]
244+
; CHECK-NEXT: [[RDX_1_NEXT_LCSSA:%.*]] = add i32 [[RDX_1_NEXT_24]], [[BIN_RDX1]]
233245
; CHECK-NEXT: [[RES:%.*]] = add i32 [[RDX_1_NEXT_LCSSA]], [[BIN_RDX5]]
234246
; CHECK-NEXT: ret i32 [[RES]]
235247
;
@@ -509,20 +521,26 @@ define i32 @test_add_with_call(i64 %n, i32 %start) {
509521
; CHECK-NEXT: br label %[[LOOP:.*]]
510522
; CHECK: [[LOOP]]:
511523
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
512-
; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
524+
; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
525+
; CHECK-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ]
526+
; CHECK-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
527+
; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
513528
; CHECK-NEXT: [[L:%.*]] = call i32 @foo()
514-
; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]]
529+
; CHECK-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[L]]
515530
; CHECK-NEXT: [[L_1:%.*]] = call i32 @foo()
516-
; CHECK-NEXT: [[RDX_2:%.*]] = add i32 [[RDX_NEXT]], [[L_1]]
531+
; CHECK-NEXT: [[RDX_NEXT_1]] = add i32 [[RDX_1]], [[L_1]]
517532
; CHECK-NEXT: [[L_2:%.*]] = call i32 @foo()
518-
; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_2]], [[L_2]]
533+
; CHECK-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_2]], [[L_2]]
519534
; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
520535
; CHECK-NEXT: [[L_3:%.*]] = call i32 @foo()
521-
; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_3]]
536+
; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_3]], [[L_3]]
522537
; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
523538
; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
524539
; CHECK: [[EXIT]]:
525-
; CHECK-NEXT: [[BIN_RDX2:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
540+
; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
541+
; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_1]], [[RDX_NEXT]]
542+
; CHECK-NEXT: [[BIN_RDX1:%.*]] = add i32 [[RDX_NEXT_2]], [[BIN_RDX]]
543+
; CHECK-NEXT: [[BIN_RDX2:%.*]] = add i32 [[RDX_NEXT_3]], [[BIN_RDX1]]
526544
; CHECK-NEXT: ret i32 [[BIN_RDX2]]
527545
;
528546
entry:
@@ -550,35 +568,41 @@ define i32 @test_add_with_backward_dep(ptr %p, i64 %n, i32 %start) {
550568
; CHECK-NEXT: br label %[[LOOP:.*]]
551569
; CHECK: [[LOOP]]:
552570
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT_3:%.*]], %[[LOOP]] ]
553-
; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
571+
; CHECK-NEXT: [[RDX_1:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_1:%.*]], %[[LOOP]] ]
572+
; CHECK-NEXT: [[RDX_2:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_2:%.*]], %[[LOOP]] ]
573+
; CHECK-NEXT: [[RDX_3:%.*]] = phi i32 [ 0, %[[ENTRY]] ], [ [[RDX_NEXT_3:%.*]], %[[LOOP]] ]
574+
; CHECK-NEXT: [[RDX:%.*]] = phi i32 [ [[START]], %[[ENTRY]] ], [ [[RDX_NEXT:%.*]], %[[LOOP]] ]
554575
; CHECK-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
555576
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV]]
556577
; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP]], align 4
557578
; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT]]
558579
; CHECK-NEXT: store i32 0, ptr [[GEP_1]], align 4
559-
; CHECK-NEXT: [[RDX_NEXT:%.*]] = add i32 [[RDX]], [[L]]
580+
; CHECK-NEXT: [[RDX_NEXT]] = add i32 [[RDX]], [[L]]
560581
; CHECK-NEXT: [[IV_NEXT_1:%.*]] = add nuw nsw i64 [[IV]], 2
561582
; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT]]
562583
; CHECK-NEXT: [[L_1:%.*]] = load i32, ptr [[GEP_11]], align 4
563584
; CHECK-NEXT: [[GEP_1_1:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_1]]
564585
; CHECK-NEXT: store i32 0, ptr [[GEP_1_1]], align 4
565-
; CHECK-NEXT: [[RDX_2:%.*]] = add i32 [[RDX_NEXT]], [[L_1]]
586+
; CHECK-NEXT: [[RDX_NEXT_1]] = add i32 [[RDX_1]], [[L_1]]
566587
; CHECK-NEXT: [[IV_NEXT_2:%.*]] = add nuw nsw i64 [[IV]], 3
567588
; CHECK-NEXT: [[GEP_2:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_1]]
568589
; CHECK-NEXT: [[L_2:%.*]] = load i32, ptr [[GEP_2]], align 4
569590
; CHECK-NEXT: [[GEP_1_2:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_2]]
570591
; CHECK-NEXT: store i32 0, ptr [[GEP_1_2]], align 4
571-
; CHECK-NEXT: [[RDX_NEXT_2:%.*]] = add i32 [[RDX_2]], [[L_2]]
592+
; CHECK-NEXT: [[RDX_NEXT_2]] = add i32 [[RDX_2]], [[L_2]]
572593
; CHECK-NEXT: [[IV_NEXT_3]] = add nuw nsw i64 [[IV]], 4
573594
; CHECK-NEXT: [[GEP_3:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_2]]
574595
; CHECK-NEXT: [[L_3:%.*]] = load i32, ptr [[GEP_3]], align 4
575596
; CHECK-NEXT: [[GEP_1_3:%.*]] = getelementptr inbounds nuw i32, ptr [[P]], i64 [[IV_NEXT_3]]
576597
; CHECK-NEXT: store i32 0, ptr [[GEP_1_3]], align 4
577-
; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_NEXT_2]], [[L_3]]
598+
; CHECK-NEXT: [[RDX_NEXT_3]] = add i32 [[RDX_3]], [[L_3]]
578599
; CHECK-NEXT: [[EC_3:%.*]] = icmp ne i64 [[IV_NEXT_3]], 1000
579600
; CHECK-NEXT: br i1 [[EC_3]], label %[[LOOP]], label %[[EXIT:.*]]
580601
; CHECK: [[EXIT]]:
581-
; CHECK-NEXT: [[BIN_RDX3:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
602+
; CHECK-NEXT: [[RDX_NEXT_LCSSA:%.*]] = phi i32 [ [[RDX_NEXT_3]], %[[LOOP]] ]
603+
; CHECK-NEXT: [[BIN_RDX:%.*]] = add i32 [[RDX_NEXT_1]], [[RDX_NEXT]]
604+
; CHECK-NEXT: [[BIN_RDX2:%.*]] = add i32 [[RDX_NEXT_2]], [[BIN_RDX]]
605+
; CHECK-NEXT: [[BIN_RDX3:%.*]] = add i32 [[RDX_NEXT_3]], [[BIN_RDX2]]
582606
; CHECK-NEXT: ret i32 [[BIN_RDX3]]
583607
;
584608
entry:

0 commit comments

Comments
 (0)