Skip to content

Commit 3df74b7

Browse files
committed
!fixup address latest comments, reorder to handle more IVs.
1 parent 557a5b9 commit 3df74b7

File tree

2 files changed

+28
-23
lines changed

2 files changed

+28
-23
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 23 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -10368,22 +10368,6 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
1036810368
VPlanTransforms::runPass(VPlanTransforms::removeDeadRecipes, MainPlan);
1036910369

1037010370
using namespace VPlanPatternMatch;
10371-
VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
10372-
VPValue *VectorTC = &MainPlan.getVectorTripCount();
10373-
// If there is a suitable resume value for the canonical induction in the
10374-
// scalar (which will become vector) epilogue loop we are done. Otherwise
10375-
// create it below.
10376-
if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
10377-
return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
10378-
m_Specific(VectorTC), m_SpecificInt(0)));
10379-
}))
10380-
return;
10381-
VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
10382-
ScalarPHBuilder.createNaryOp(
10383-
VPInstruction::ResumePhi,
10384-
{VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
10385-
"vec.epilog.resume.val");
10386-
1038710371
// When vectorizing the epilogue, FindLastIV reductions can introduce multiple
1038810372
// uses of undef/poison. If the reduction start value may be undef or poison
1038910373
// it needs to be frozen and the frozen start has to be used when computing
@@ -10413,6 +10397,22 @@ static void preparePlanForMainVectorLoop(VPlan &MainPlan, VPlan &EpiPlan) {
1041310397
};
1041410398
AddFreezeForFindLastIVReductions(MainPlan, true);
1041510399
AddFreezeForFindLastIVReductions(EpiPlan, false);
10400+
10401+
VPBasicBlock *MainScalarPH = MainPlan.getScalarPreheader();
10402+
VPValue *VectorTC = &MainPlan.getVectorTripCount();
10403+
// If there is a suitable resume value for the canonical induction in the
10404+
// scalar (which will become vector) epilogue loop we are done. Otherwise
10405+
// create it below.
10406+
if (any_of(*MainScalarPH, [VectorTC](VPRecipeBase &R) {
10407+
return match(&R, m_VPInstruction<VPInstruction::ResumePhi>(
10408+
m_Specific(VectorTC), m_SpecificInt(0)));
10409+
}))
10410+
return;
10411+
VPBuilder ScalarPHBuilder(MainScalarPH, MainScalarPH->begin());
10412+
ScalarPHBuilder.createNaryOp(
10413+
VPInstruction::ResumePhi,
10414+
{VectorTC, MainPlan.getCanonicalIV()->getStartValue()}, {},
10415+
"vec.epilog.resume.val");
1041610416
}
1041710417

1041810418
/// Prepare \p Plan for vectorizing the epilogue loop. That is, re-use expanded
@@ -10521,20 +10521,24 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
1052110521
cast<VPHeaderPHIRecipe>(&R)->setStartValue(StartVal);
1052210522
}
1052310523

10524-
// Re-use the trip count and steps expanded for the main loop, as
10525-
// skeleton creation needs it as a value that dominates both the scalar
10526-
// and vector epilogue loops
10524+
// For some VPValues in the epilogue plan we must re-use the generated IR
10525+
// values from the main plan. Replace them with live-in VPValues.
1052710526
// TODO: This is a workaround needed for epilogue vectorization and it
1052810527
// should be removed once induction resume value creation is done
1052910528
// directly in VPlan.
1053010529
for (auto &R : make_early_inc_range(*Plan.getEntry())) {
10530+
// Re-use frozen values from the main plan for Freeze VPInstructions in the
10531+
// epilogue plan. This ensures all users use the same frozen value.
1053110532
auto *VPI = dyn_cast<VPInstruction>(&R);
1053210533
if (VPI && VPI->getOpcode() == Instruction::Freeze) {
1053310534
VPI->replaceAllUsesWith(Plan.getOrAddLiveIn(
1053410535
ToFrozen.lookup(VPI->getOperand(0)->getLiveInIRValue())));
1053510536
continue;
1053610537
}
1053710538

10539+
// Re-use the trip count and steps expanded for the main loop, as
10540+
// skeleton creation needs it as a value that dominates both the scalar
10541+
// and vector epilogue loops.
1053810542
auto *ExpandR = dyn_cast<VPExpandSCEVRecipe>(&R);
1053910543
if (!ExpandR)
1054010544
continue;

llvm/test/Transforms/LoopVectorize/AArch64/epilog-iv-select-cmp.ll

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -129,6 +129,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
129129
; CHECK-NEXT: [[N_POS:%.*]] = icmp sgt i32 [[N]], 0
130130
; CHECK-NEXT: call void @llvm.assume(i1 [[N_POS]])
131131
; CHECK-NEXT: [[N_EXT:%.*]] = zext i32 [[N]] to i64
132+
; CHECK-NEXT: [[FR:%.*]] = freeze i32 [[START]]
132133
; CHECK-NEXT: [[TMP0:%.*]] = add nuw nsw i64 [[N_EXT]], 1
133134
; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP0]], 4
134135
; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
@@ -167,7 +168,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
167168
; CHECK-NEXT: [[RDX_MINMAX6:%.*]] = call <4 x i32> @llvm.smax.v4i32(<4 x i32> [[RDX_MINMAX5]], <4 x i32> [[TMP6]])
168169
; CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[RDX_MINMAX6]])
169170
; CHECK-NEXT: [[RDX_SELECT_CMP:%.*]] = icmp ne i32 [[TMP8]], -2147483648
170-
; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP8]], i32 [[START]]
171+
; CHECK-NEXT: [[RDX_SELECT:%.*]] = select i1 [[RDX_SELECT_CMP]], i32 [[TMP8]], i32 [[FR]]
171172
; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]]
172173
; CHECK-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
173174
; CHECK: [[VEC_EPILOG_ITER_CHECK]]:
@@ -176,8 +177,8 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
176177
; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]]
177178
; CHECK: [[VEC_EPILOG_PH]]:
178179
; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], %[[VEC_EPILOG_ITER_CHECK]] ], [ 0, %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
179-
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[START]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
180-
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[BC_MERGE_RDX]], [[START]]
180+
; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[RDX_SELECT]], %[[VEC_EPILOG_ITER_CHECK]] ], [ [[FR]], %[[VECTOR_MAIN_LOOP_ITER_CHECK]] ]
181+
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i32 [[BC_MERGE_RDX]], [[FR]]
181182
; CHECK-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i32 -2147483648, i32 [[BC_MERGE_RDX]]
182183
; CHECK-NEXT: [[N_MOD_VF7:%.*]] = urem i64 [[TMP0]], 4
183184
; CHECK-NEXT: [[N_VEC8:%.*]] = sub i64 [[TMP0]], [[N_MOD_VF7]]
@@ -204,7 +205,7 @@ define i32 @select_icmp_var_start_iv_trunc(i32 %N, i32 %start) #0 {
204205
; CHECK: [[VEC_EPILOG_MIDDLE_BLOCK]]:
205206
; CHECK-NEXT: [[TMP16:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP14]])
206207
; CHECK-NEXT: [[RDX_SELECT_CMP18:%.*]] = icmp ne i32 [[TMP16]], -2147483648
207-
; CHECK-NEXT: [[RDX_SELECT19:%.*]] = select i1 [[RDX_SELECT_CMP18]], i32 [[TMP16]], i32 [[START]]
208+
; CHECK-NEXT: [[RDX_SELECT19:%.*]] = select i1 [[RDX_SELECT_CMP18]], i32 [[TMP16]], i32 [[FR]]
208209
; CHECK-NEXT: [[CMP_N20:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC8]]
209210
; CHECK-NEXT: br i1 [[CMP_N20]], label %[[EXIT]], label %[[VEC_EPILOG_SCALAR_PH]]
210211
; CHECK: [[VEC_EPILOG_SCALAR_PH]]:

0 commit comments

Comments
 (0)