Skip to content

Commit cfbd563

Browse files
fhahn and pawosm-arm
authored and committed
[LV] Use frozen start value for FindLastIV if needed. (#132691)
When the epilogue is vectorized, FindLastIV introduces multiple uses of the start value, where the original source had only a single use. Each use of undef may produce a different result, so introducing multiple uses can produce incorrect results when the input is undef/poison.

If the start value may be undef or poison, freeze it and use the frozen value, which will be the same at all uses.

See the following scenarios in Alive2:
* Both main and epilogue vector loops execute, go to exit block: https://alive2.llvm.org/ce/z/_TSvRr
* Both main and epilogue vector loops execute, go to scalar loop: https://alive2.llvm.org/ce/z/CsPj5v
* Only epilogue vector loop executes, go to exit block: https://alive2.llvm.org/ce/z/5XqkNV
* Only epilogue vector loop executes, go to scalar loop: https://alive2.llvm.org/ce/z/JUpqRN

The latter two scenarios show that the resume phi must also use the frozen value. That means we cannot freeze in the preheader. We could move the freeze to the main iteration count check, but that would be a bit fragile to find, and other transforms can sink the freeze if needed.

Depends on #132689 and #132690.

Fixes #126836

PR: #132691
1 parent e8d8800 commit cfbd563

File tree

4 files changed

+95
-36
lines changed

4 files changed

+95
-36
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 73 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -7536,14 +7536,17 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
75367536
} else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(
75377537
RdxDesc.getRecurrenceKind())) {
75387538
using namespace llvm::PatternMatch;
7539-
Value *Cmp, *OrigResumeV;
7539+
Value *Cmp, *OrigResumeV, *CmpOp;
75407540
bool IsExpectedPattern =
75417541
match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)),
75427542
m_Specific(RdxDesc.getSentinelValue()),
75437543
m_Value(OrigResumeV))) &&
7544-
match(Cmp,
7545-
m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
7546-
m_Specific(RdxDesc.getRecurrenceStartValue())));
7544+
(match(Cmp, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV),
7545+
m_Value(CmpOp))) &&
7546+
(match(CmpOp,
7547+
m_Freeze(m_Specific(RdxDesc.getRecurrenceStartValue()))) ||
7548+
(CmpOp == RdxDesc.getRecurrenceStartValue() &&
7549+
isGuaranteedNotToBeUndefOrPoison(CmpOp))));
75477550
assert(IsExpectedPattern && "Unexpected reduction resume pattern");
75487551
(void)IsExpectedPattern;
75497552
MainResumeValue = OrigResumeV;
@@ -10125,6 +10128,36 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
1012510128
VPlanTransforms::removeDeadRecipes(MainPlan);
1012610129

1012710130
using namespace VPlanPatternMatch;
10131+
// When vectorizing the epilogue, FindLastIV reductions can introduce multiple
10132+
// uses of undef/poison. If the reduction start value may be undef or poison
10133+
// it needs to be frozen and the frozen start has to be used when computing
10134+
// the reduction result. We also need to use the frozen value in the resume
10135+
// phi generated by the main vector loop, as this is also used to compute the
10136+
// reduction result after the epilogue vector loop.
10137+
auto AddFreezeForFindLastIVReductions = [](VPlan &Plan,
10138+
bool UpdateResumePhis) {
10139+
VPBuilder Builder(Plan.getEntry());
10140+
for (VPRecipeBase &R : *Plan.getMiddleBlock()) {
10141+
auto *VPI = dyn_cast<VPInstruction>(&R);
10142+
if (!VPI || VPI->getOpcode() != VPInstruction::ComputeFindLastIVResult)
10143+
continue;
10144+
VPValue *OrigStart = VPI->getOperand(1);
10145+
if (isGuaranteedNotToBeUndefOrPoison(OrigStart->getLiveInIRValue()))
10146+
continue;
10147+
VPInstruction *Freeze =
10148+
Builder.createNaryOp(Instruction::Freeze, {OrigStart}, {}, "fr");
10149+
VPI->setOperand(1, Freeze);
10150+
if (UpdateResumePhis)
10151+
OrigStart->replaceUsesWithIf(Freeze, [Freeze](VPUser &U, unsigned) {
10152+
return Freeze != &U && isa<VPInstruction>(&U) &&
10153+
cast<VPInstruction>(&U)->getOpcode() ==
10154+
VPInstruction::ResumePhi;
10155+
});
10156+
}
10157+
};
10158+
AddFreezeForFindLastIVReductions(MainPlan, true);
10159+
AddFreezeForFindLastIVReductions(EpiPlan, false);
10160+
1012810161
VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
1012910162
VPValue *VectorTC = &MainPlan.getVectorTripCount();
1013010163
// If there is a suitable resume value for the canonical induction in the
@@ -10152,24 +10185,7 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
1015210185
VPBasicBlock *Header = VectorLoop->getEntryBasicBlock();
1015310186
Header->setName("vec.epilog.vector.body");
1015410187

10155-
// Re-use the trip count and steps expanded for the main loop, as
10156-
// skeleton creation needs it as a value that dominates both the scalar
10157-
// and vector epilogue loops
10158-
// TODO: This is a workaround needed for epilogue vectorization and it
10159-
// should be removed once induction resume value creation is done
10160-
// directly in VPlan.
10161-
for (auto &R : make_early_inc_range(*Plan.getEntry())) {
10162-
auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
10163-
if (!ExpandR)
10164-
continue;
10165-
auto *ExpandedVal =
10166-
Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10167-
ExpandR->replaceAllUsesWith(ExpandedVal);
10168-
if (Plan.getTripCount() == ExpandR)
10169-
Plan.resetTripCount(ExpandedVal);
10170-
ExpandR->eraseFromParent();
10171-
}
10172-
10188+
DenseMap<Value *, Value *> ToFrozen;
1017310189
// Ensure that the start values for all header phi recipes are updated before
1017410190
// vectorizing the epilogue loop.
1017510191
for (VPRecipeBase &R : Header->phis()) {
@@ -10235,6 +10251,10 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
1023510251
ResumeV =
1023610252
Builder.CreateICmpNE(ResumeV, RdxDesc.getRecurrenceStartValue());
1023710253
} else if (RecurrenceDescriptor::isFindLastIVRecurrenceKind(RK)) {
10254+
ToFrozen[RdxDesc.getRecurrenceStartValue()] =
10255+
cast<PHINode>(ResumeV)->getIncomingValueForBlock(
10256+
EPI.MainLoopIterationCountCheck);
10257+
1023810258
// VPReductionPHIRecipe for FindLastIV reductions requires an adjustment
1023910259
// to the resume value. The resume value is adjusted to the sentinel
1024010260
// value when the final value from the main vector loop equals the start
@@ -10243,8 +10263,8 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
1024310263
// variable.
1024410264
BasicBlock *ResumeBB = cast<Instruction>(ResumeV)->getParent();
1024510265
IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt());
10246-
Value *Cmp =
10247-
Builder.CreateICmpEQ(ResumeV, RdxDesc.getRecurrenceStartValue());
10266+
Value *Cmp = Builder.CreateICmpEQ(
10267+
ResumeV, ToFrozen[RdxDesc.getRecurrenceStartValue()]);
1024810268
ResumeV =
1024910269
Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV);
1025010270
}
@@ -10260,6 +10280,35 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
1026010280
VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV);
1026110281
cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
1026210282
}
10283+
10284+
// For some VPValues in the epilogue plan we must re-use the generated IR
10285+
// values from the main plan. Replace them with live-in VPValues.
10286+
// TODO: This is a workaround needed for epilogue vectorization and it
10287+
// should be removed once induction resume value creation is done
10288+
// directly in VPlan.
10289+
for (auto &R : make_early_inc_range(*Plan.getEntry())) {
10290+
// Re-use frozen values from the main plan for Freeze VPInstructions in the
10291+
// epilogue plan. This ensures all users use the same frozen value.
10292+
auto *VPI = dyn_cast<VPInstruction>(&R);
10293+
if (VPI && VPI->getOpcode() == Instruction::Freeze) {
10294+
VPI->replaceAllUsesWith(Plan.getOrAddLiveIn(
10295+
ToFrozen.lookup(VPI->getOperand(0)->getLiveInIRValue())));
10296+
continue;
10297+
}
10298+
10299+
// Re-use the trip count and steps expanded for the main loop, as
10300+
// skeleton creation needs it as a value that dominates both the scalar
10301+
// and vector epilogue loops
10302+
auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
10303+
if (!ExpandR)
10304+
continue;
10305+
auto *ExpandedVal =
10306+
Plan.getOrAddLiveIn(ExpandedSCEVs.find(ExpandR->getSCEV())->second);
10307+
ExpandR->replaceAllUsesWith(ExpandedVal);
10308+
if (Plan.getTripCount() == ExpandR)
10309+
Plan.resetTripCount(ExpandedVal);
10310+
ExpandR->eraseFromParent();
10311+
}
1026310312
}
1026410313

1026510314
// Generate bypass values from the additional bypass block. Note that when the

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -406,6 +406,7 @@ bool VPInstruction::canGenerateScalarForFirstLane() const {
406406
if (isSingleScalar() || isVectorToScalar())
407407
return true;
408408
switch (Opcode) {
409+
case Instruction::Freeze:
409410
case Instruction::ICmp:
410411
case Instruction::Select:
411412
case VPInstruction::BranchOnCond:
@@ -450,6 +451,10 @@ Value *VPInstruction::generate(VPTransformState &State) {
450451
Value *A = State.get(getOperand(0));
451452
return Builder.CreateNot(A, Name);
452453
}
454+
case Instruction::Freeze: {
455+
Value *Op = State.get(getOperand(0), vputils::onlyFirstLaneUsed(this));
456+
return Builder.CreateFreeze(Op, Name);
457+
}
453458
case Instruction::ICmp: {
454459
bool OnlyFirstLaneUsed = vputils::onlyFirstLaneUsed(this);
455460
Value *A = State.get(getOperand(0), OnlyFirstLaneUsed);
@@ -820,6 +825,7 @@ bool VPInstruction::opcodeMayReadOrWriteFromMemory() const {
820825
if (Instruction::isBinaryOp(getOpcode()))
821826
return false;
822827
switch (getOpcode()) {
828+
case Instruction::Freeze:
823829
case Instruction::ICmp:
824830
case Instruction::Select:
825831
case VPInstruction::AnyOf:
@@ -848,6 +854,7 @@ bool VPInstruction::onlyFirstLaneUsed(const VPValue *Op) const {
848854
case Instruction::Select:
849855
case Instruction::Or:
850856
case VPInstruction::PtrAdd:
857+
case Instruction::Freeze:
851858
// TODO: Cover additional opcodes.
852859
return vputils::onlyFirstLaneUsed(this);
853860
case VPInstruction::ActiveLaneMask:

llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
99
; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32
1010
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
1111
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 8
12+
; CHECK-NEXT: [[FR:%.*]] = freeze i8 [[START]]
1213
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
1314
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
1415
; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP2]], 32
@@ -42,7 +43,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
4243
; CHECK-NEXT: [[RDX_MINMAX:%.*]] = call <16 x i8> @llvm.smax.v16i8(<16 x i8> [[TMP10]], <16 x i8> [[TMP11]])
4344
; CHECK-NEXT: [[TMP13:%.*]] = call i8 @llvm.vector.reduce.smax.v16i8(<16 x i8> [[RDX_MINMAX]])
4445
; CHECK-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp ne i8 [[TMP13]], -128
45-
; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP13]], i8 [[START]]
46+
; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP13]], i8 [[FR]]
4647
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
4748
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
4849
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
@@ -53,8 +54,8 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
5354
; CHECK: [[VEC_EPILOG_PH]]:
5455
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
5556
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
56-
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
57-
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[START]]
57+
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
58+
; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[FR]]
5859
; CHECK-NEXT: [[TMP15:%.*]] = select i1 [[TMP14]], i8 -128, i8 [[BC_MERGE_RDX]]
5960
; CHECK-NEXT: [[N_MOD_VF4:%.*]] = urem i32 [[TMP2]], 8
6061
; CHECK-NEXT: [[N_VEC5:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF4]]
@@ -82,7 +83,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
8283
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
8384
; CHECK-NEXT: [[TMP22:%.*]] = call i8 @llvm.vector.reduce.smax.v8i8(<8 x i8> [[TMP20]])
8485
; CHECK-NEXT: [[RDX_SELECT_CMP14:%.*]] = icmp ne i8 [[TMP22]], -128
85-
; CHECK-NEXT: [[RDX_SELECT15:%.*]] = select i1 [[RDX_SELECT_CMP14]], i8 [[TMP22]], i8 [[START]]
86+
; CHECK-NEXT: [[RDX_SELECT15:%.*]] = select i1 [[RDX_SELECT_CMP14]], i8 [[TMP22]], i8 [[FR]]
8687
; CHECK-NEXT: [[CMP_N16:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC5]]
8788
; CHECK-NEXT: br i1 [[CMP_N16]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
8889
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:
@@ -128,6 +129,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
128129
; CHECK-NEXT: [[N_POS:%.*]] = icmp sgt i32 [[N]], 0
129130
; CHECK-NEXT: call void @llvm.assume(i1 [[N_POS]])
130131
; CHECK-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64
132+
; CHECK-NEXT: [[FR:%.*]] = freeze i32 [[START]]
131133
; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[N_EXT]], 1
132134
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
133135
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
@@ -166,7 +168,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
166168
; CHECK-NEXT: [[RDX_MINMAX6:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX5]], <4 x i32> [[TMP6]])
167169
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[RDX_MINMAX6]])
168170
; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP8]], -2147483648
169-
; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP8]], i32 [[START]]
171+
; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP8]], i32 [[FR]]
170172
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
171173
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
172174
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
@@ -175,8 +177,8 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
175177
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
176178
; CHECK: [[VEC_EPILOG_PH]]:
177179
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
178-
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
179-
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[BC_MERGE_RDX]], [[START]]
180+
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
181+
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[BC_MERGE_RDX]], [[FR]]
180182
; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -2147483648, i32 [[BC_MERGE_RDX]]
181183
; CHECK-NEXT: [[N_MOD_VF7:%.*]] = urem i64 [[TMP0]], 4
182184
; CHECK-NEXT: [[N_VEC8:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF7]]
@@ -203,7 +205,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
203205
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
204206
; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP14]])
205207
; CHECK-NEXT: [[RDX_SELECT_CMP18:%.*]] = icmp ne i32 [[TMP16]], -2147483648
206-
; CHECK-NEXT: [[RDX_SELECT19:%.*]] = select i1 [[RDX_SELECT_CMP18]], i32 [[TMP16]], i32 [[START]]
208+
; CHECK-NEXT: [[RDX_SELECT19:%.*]] = select i1 [[RDX_SELECT_CMP18]], i32 [[TMP16]], i32 [[FR]]
207209
; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC8]]
208210
; CHECK-NEXT: br i1 [[CMP_N20]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
209211
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:

llvm/test/Transforms/LoopVectorize/epilog-iv-select-cmp.ll

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -221,6 +221,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
221221
; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[TMP0]] to i32
222222
; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i32 [[TMP1]], 1
223223
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[TMP2]], 4
224+
; CHECK-NEXT: [[FR:%.*]] = freeze i8 [[START]]
224225
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
225226
; CHECK: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
226227
; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i32 [[TMP2]], 4
@@ -247,7 +248,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
247248
; CHECK: [[MIDDLE_BLOCK]]:
248249
; CHECK-NEXT: [[TMP10:%.*]] = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> [[TMP8]])
249250
; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i8 [[TMP10]], -128
250-
; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i8 [[TMP10]], i8 [[START]]
251+
; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i8 [[TMP10]], i8 [[FR]]
251252
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC]]
252253
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
253254
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
@@ -258,8 +259,8 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
258259
; CHECK: [[VEC_EPILOG_PH]]:
259260
; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i32 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
260261
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i8 [ [[TMP3]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
261-
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
262-
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[START]]
262+
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i8 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
263+
; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i8 [[BC_MERGE_RDX]], [[FR]]
263264
; CHECK-NEXT: [[TMP12:%.*]] = select i1 [[TMP11]], i8 -128, i8 [[BC_MERGE_RDX]]
264265
; CHECK-NEXT: [[N_MOD_VF2:%.*]] = urem i32 [[TMP2]], 4
265266
; CHECK-NEXT: [[N_VEC3:%.*]] = sub i32 [[TMP2]], [[N_MOD_VF2]]
@@ -287,7 +288,7 @@ define i8 @select_icmp_var_start(ptr %a, i8 %n, i8 %start) {
287288
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
288289
; CHECK-NEXT: [[TMP19:%.*]] = call i8 @llvm.vector.reduce.smax.v4i8(<4 x i8> [[TMP17]])
289290
; CHECK-NEXT: [[RDX_SELECT_CMP12:%.*]] = icmp ne i8 [[TMP19]], -128
290-
; CHECK-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP19]], i8 [[START]]
291+
; CHECK-NEXT: [[RDX_SELECT13:%.*]] = select i1 [[RDX_SELECT_CMP12]], i8 [[TMP19]], i8 [[FR]]
291292
; CHECK-NEXT: [[CMP_N14:%.*]] = icmp eq i32 [[TMP2]], [[N_VEC3]]
292293
; CHECK-NEXT: br i1 [[CMP_N14]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
293294
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:

0 commit comments

Comments
 (0)