Skip to content

Commit 7f44a97

Browse files
author
git apple-llvm automerger
committed
Merge commit '0d3ba087f706' from llvm.org/main into next
2 parents bd6609e + 0d3ba08 commit 7f44a97

File tree

3 files changed

+69
-107
lines changed

3 files changed

+69
-107
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -446,17 +446,15 @@ class LoopVectorizationPlanner {
446446
/// TODO: \p VectorizingEpilogue indicates if the executed VPlan is for the
447447
/// epilogue vector loop. It should be removed once the re-use issue has been
448448
/// fixed.
449-
/// \p ExpandedSCEVs is passed during execution of the plan for epilogue loop
450-
/// to re-use expansion results generated during main plan execution.
451449
///
452450
/// Returns a mapping of SCEVs to their expanded IR values.
453451
/// Note that this is a temporary workaround needed due to the current
454452
/// epilogue handling.
455-
DenseMap<const SCEV *, Value *>
456-
executePlan(ElementCount VF, unsigned UF, VPlan &BestPlan,
457-
InnerLoopVectorizer &LB, DominatorTree *DT,
458-
bool VectorizingEpilogue,
459-
const DenseMap<const SCEV *, Value *> *ExpandedSCEVs = nullptr);
453+
DenseMap<const SCEV *, Value *> executePlan(ElementCount VF, unsigned UF,
454+
VPlan &BestPlan,
455+
InnerLoopVectorizer &LB,
456+
DominatorTree *DT,
457+
bool VectorizingEpilogue);
460458

461459
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
462460
void printPlans(raw_ostream &O);

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 57 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -503,11 +503,8 @@ class InnerLoopVectorizer {
503503
/// is generated around the vectorized (and scalar epilogue) loops consisting
504504
/// of various checks and bypasses. Return the pre-header block of the new
505505
/// loop. In the case of epilogue vectorization, this function is overridden to
506-
/// handle the more complex control flow around the loops. \p ExpandedSCEVs is
507-
/// used to look up SCEV expansions for expressions needed during skeleton
508-
/// creation.
509-
virtual BasicBlock *
510-
createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs);
506+
/// handle the more complex control flow around the loops.
507+
virtual BasicBlock *createVectorizedLoopSkeleton();
511508

512509
/// Fix the vectorized code, taking care of header phi's, and more.
513510
void fixVectorizedLoop(VPTransformState &State);
@@ -535,12 +532,6 @@ class InnerLoopVectorizer {
535532
/// count of the original loop for both main loop and epilogue vectorization.
536533
void setTripCount(Value *TC) { TripCount = TC; }
537534

538-
// Retrieve the additional bypass value associated with an original
539-
/// induction header phi.
540-
Value *getInductionAdditionalBypassValue(PHINode *OrigPhi) const {
541-
return Induction2AdditionalBypassValue.at(OrigPhi);
542-
}
543-
544535
/// Return the additional bypass block which targets the scalar loop by
545536
/// skipping the epilogue loop after completing the main loop.
546537
BasicBlock *getAdditionalBypassBlock() const {
@@ -577,11 +568,6 @@ class InnerLoopVectorizer {
577568
/// vector loop preheader, middle block and scalar preheader.
578569
void createVectorLoopSkeleton(StringRef Prefix);
579570

580-
/// Create and record the values for induction variables to resume coming from
581-
/// the additional bypass block.
582-
void createInductionAdditionalBypassValues(const SCEV2ValueTy &ExpandedSCEVs,
583-
Value *MainVectorTripCount);
584-
585571
/// Allow subclasses to override and print debug traces before/after vplan
586572
/// execution, when trace information is requested.
587573
virtual void printDebugTracesAtStart() {}
@@ -671,11 +657,6 @@ class InnerLoopVectorizer {
671657
/// for cleaning the checks, if vectorization turns out unprofitable.
672658
GeneratedRTChecks &RTChecks;
673659

674-
/// Mapping of induction phis to their additional bypass values. They
675-
/// need to be added as operands to phi nodes in the scalar loop preheader
676-
/// after the epilogue skeleton has been created.
677-
DenseMap<PHINode *, Value *> Induction2AdditionalBypassValue;
678-
679660
/// The additional bypass block which conditionally skips over the epilogue
680661
/// loop after executing the main loop. Needed to resume inductions and
681662
/// reductions during epilogue vectorization.
@@ -738,16 +719,14 @@ class InnerLoopAndEpilogueVectorizer : public InnerLoopVectorizer {
738719

739720
// Override this function to handle the more complex control flow around the
740721
// three loops.
741-
BasicBlock *
742-
createVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final {
743-
return createEpilogueVectorizedLoopSkeleton(ExpandedSCEVs);
722+
BasicBlock *createVectorizedLoopSkeleton() final {
723+
return createEpilogueVectorizedLoopSkeleton();
744724
}
745725

746726
/// The interface for creating a vectorized skeleton using one of two
747727
/// different strategies, each corresponding to one execution of the vplan
748728
/// as described above.
749-
virtual BasicBlock *
750-
createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) = 0;
729+
virtual BasicBlock *createEpilogueVectorizedLoopSkeleton() = 0;
751730

752731
/// Holds and updates state information required to vectorize the main loop
753732
/// and its epilogue in two separate passes. This setup helps us avoid
@@ -775,8 +754,7 @@ class EpilogueVectorizerMainLoop : public InnerLoopAndEpilogueVectorizer {
775754
EPI, LVL, CM, BFI, PSI, Check, Plan) {}
776755
/// Implements the interface for creating a vectorized skeleton using the
777756
/// *main loop* strategy (ie the first pass of vplan execution).
778-
BasicBlock *
779-
createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
757+
BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
780758

781759
protected:
782760
/// Emits an iteration count bypass check once for the main loop (when \p
@@ -806,8 +784,7 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer {
806784
}
807785
/// Implements the interface for creating a vectorized skeleton using the
808786
/// *epilogue loop* strategy (ie the second pass of vplan execution).
809-
BasicBlock *
810-
createEpilogueVectorizedLoopSkeleton(const SCEV2ValueTy &ExpandedSCEVs) final;
787+
BasicBlock *createEpilogueVectorizedLoopSkeleton() final;
811788

812789
protected:
813790
/// Emits an iteration count bypass check after the main vector loop has
@@ -2722,44 +2699,7 @@ static void addFullyUnrolledInstructionsToIgnore(
27222699
}
27232700
}
27242701

2725-
void InnerLoopVectorizer::createInductionAdditionalBypassValues(
2726-
const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount) {
2727-
assert(MainVectorTripCount && "Must have bypass information");
2728-
2729-
Instruction *OldInduction = Legal->getPrimaryInduction();
2730-
IRBuilder<> BypassBuilder(getAdditionalBypassBlock(),
2731-
getAdditionalBypassBlock()->getFirstInsertionPt());
2732-
for (const auto &InductionEntry : Legal->getInductionVars()) {
2733-
PHINode *OrigPhi = InductionEntry.first;
2734-
const InductionDescriptor &II = InductionEntry.second;
2735-
Value *Step = getExpandedStep(II, ExpandedSCEVs);
2736-
// For the primary induction the additional bypass end value is known.
2737-
// Otherwise it is computed.
2738-
Value *EndValueFromAdditionalBypass = MainVectorTripCount;
2739-
if (OrigPhi != OldInduction) {
2740-
auto *BinOp = II.getInductionBinOp();
2741-
// Fast-math-flags propagate from the original induction instruction.
2742-
if (isa_and_nonnull<FPMathOperator>(BinOp))
2743-
BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
2744-
2745-
// Compute the end value for the additional bypass.
2746-
EndValueFromAdditionalBypass =
2747-
emitTransformedIndex(BypassBuilder, MainVectorTripCount,
2748-
II.getStartValue(), Step, II.getKind(), BinOp);
2749-
EndValueFromAdditionalBypass->setName("ind.end");
2750-
}
2751-
2752-
// Store the bypass value here, as it needs to be added as operand to its
2753-
// scalar preheader phi node after the epilogue skeleton has been created.
2754-
// TODO: Directly add as extra operand to the VPResumePHI recipe.
2755-
assert(!Induction2AdditionalBypassValue.contains(OrigPhi) &&
2756-
"entry for OrigPhi already exits");
2757-
Induction2AdditionalBypassValue[OrigPhi] = EndValueFromAdditionalBypass;
2758-
}
2759-
}
2760-
2761-
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton(
2762-
const SCEV2ValueTy &ExpandedSCEVs) {
2702+
BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
27632703
/*
27642704
In this function we generate a new loop. The new loop will contain
27652705
the vectorized instructions while the old loop will continue to run the
@@ -7726,16 +7666,11 @@ static void fixReductionScalarResumeWhenVectorizingEpilog(
77267666

77277667
DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
77287668
ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan,
7729-
InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue,
7730-
const DenseMap<const SCEV *, Value *> *ExpandedSCEVs) {
7669+
InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) {
77317670
assert(BestVPlan.hasVF(BestVF) &&
77327671
"Trying to execute plan with unsupported VF");
77337672
assert(BestVPlan.hasUF(BestUF) &&
77347673
"Trying to execute plan with unsupported UF");
7735-
assert(
7736-
((VectorizingEpilogue && ExpandedSCEVs) ||
7737-
(!VectorizingEpilogue && !ExpandedSCEVs)) &&
7738-
"expanded SCEVs to reuse can only be used during epilogue vectorization");
77397674
// TODO: Move to VPlan transform stage once the transition to the VPlan-based
77407675
// cost model is complete for better cost estimates.
77417676
VPlanTransforms::runPass(VPlanTransforms::unrollByUF, BestVPlan, BestUF,
@@ -7773,8 +7708,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
77737708
// middle block. The vector loop is created during VPlan execution.
77747709
VPBasicBlock *VectorPH =
77757710
cast<VPBasicBlock>(BestVPlan.getEntry()->getSingleSuccessor());
7776-
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton(
7777-
ExpandedSCEVs ? *ExpandedSCEVs : State.ExpandedSCEVs);
7711+
7712+
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
77787713
if (VectorizingEpilogue)
77797714
VPlanTransforms::removeDeadRecipes(BestVPlan);
77807715

@@ -7815,8 +7750,8 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
78157750
BestVPlan.execute(&State);
78167751

78177752
auto *MiddleVPBB = BestVPlan.getMiddleBlock();
7818-
// 2.5 When vectorizing the epilogue, fix reduction and induction resume
7819-
// values from the additional bypass block.
7753+
// 2.5 When vectorizing the epilogue, fix reduction resume values from the
7754+
// additional bypass block.
78207755
if (VectorizingEpilogue) {
78217756
assert(!ILV.Legal->hasUncountableEarlyExit() &&
78227757
"Epilogue vectorisation not yet supported with early exits");
@@ -7834,11 +7769,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
78347769
fixReductionScalarResumeWhenVectorizingEpilog(
78357770
&R, State, State.CFG.VPBB2IRBB[MiddleVPBB], BypassBlock);
78367771
}
7837-
for (const auto &[IVPhi, _] : Legal->getInductionVars()) {
7838-
auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
7839-
Value *V = ILV.getInductionAdditionalBypassValue(IVPhi);
7840-
Inc->setIncomingValueForBlock(BypassBlock, V);
7841-
}
78427772
}
78437773

78447774
// 2.6. Maintain Loop Hints
@@ -7900,8 +7830,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
79007830

79017831
/// This function is partially responsible for generating the control flow
79027832
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
7903-
BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton(
7904-
const SCEV2ValueTy &ExpandedSCEVs) {
7833+
BasicBlock *EpilogueVectorizerMainLoop::createEpilogueVectorizedLoopSkeleton() {
79057834
createVectorLoopSkeleton("");
79067835

79077836
// Generate the code to check the minimum iteration count of the vector
@@ -8011,8 +7940,7 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass,
80117940
/// This function is partially responsible for generating the control flow
80127941
/// depicted in https://llvm.org/docs/Vectorizers.html#epilogue-vectorization.
80137942
BasicBlock *
8014-
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
8015-
const SCEV2ValueTy &ExpandedSCEVs) {
7943+
EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() {
80167944
createVectorLoopSkeleton("vec.epilog.");
80177945

80187946
// Now, compare the remaining count and if there aren't enough iterations to
@@ -8080,11 +8008,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
80808008
Phi->removeIncomingValue(EPI.MemSafetyCheck);
80818009
}
80828010

8083-
// Generate bypass values from the additional bypass block. Note that when the
8084-
// vectorized epilogue is skipped due to iteration count check, then the
8085-
// resume value for the induction variable comes from the trip count of the
8086-
// main vector loop, passed as the second argument.
8087-
createInductionAdditionalBypassValues(ExpandedSCEVs, EPI.VectorTripCount);
80888011
return LoopVectorPreHeader;
80898012
}
80908013

@@ -10529,6 +10452,33 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L,
1052910452
}
1053010453
}
1053110454

10455+
// Generate bypass values from the additional bypass block. Note that when the
10456+
// vectorized epilogue is skipped due to iteration count check, then the
10457+
// resume value for the induction variable comes from the trip count of the
10458+
// main vector loop, passed as \p MainVectorTripCount.
10459+
static Value *createInductionAdditionalBypassValues(
10460+
PHINode *OrigPhi, const InductionDescriptor &II, IRBuilder<> &BypassBuilder,
10461+
const SCEV2ValueTy &ExpandedSCEVs, Value *MainVectorTripCount,
10462+
Instruction *OldInduction) {
10463+
Value *Step = getExpandedStep(II, ExpandedSCEVs);
10464+
// For the primary induction the additional bypass end value is known.
10465+
// Otherwise it is computed.
10466+
Value *EndValueFromAdditionalBypass = MainVectorTripCount;
10467+
if (OrigPhi != OldInduction) {
10468+
auto *BinOp = II.getInductionBinOp();
10469+
// Fast-math-flags propagate from the original induction instruction.
10470+
if (isa_and_nonnull<FPMathOperator>(BinOp))
10471+
BypassBuilder.setFastMathFlags(BinOp->getFastMathFlags());
10472+
10473+
// Compute the end value for the additional bypass.
10474+
EndValueFromAdditionalBypass =
10475+
emitTransformedIndex(BypassBuilder, MainVectorTripCount,
10476+
II.getStartValue(), Step, II.getKind(), BinOp);
10477+
EndValueFromAdditionalBypass->setName("ind.end");
10478+
}
10479+
return EndValueFromAdditionalBypass;
10480+
}
10481+
1053210482
bool LoopVectorizePass::processLoop(Loop *L) {
1053310483
assert((EnableVPlanNativePath || L->isInnermost()) &&
1053410484
"VPlan-native path is not enabled. Only process inner loops.");
@@ -10912,7 +10862,21 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1091210862
preparePlanForEpilogueVectorLoop(BestEpiPlan, L, ExpandedSCEVs, EPI);
1091310863

1091410864
LVP.executePlan(EPI.EpilogueVF, EPI.EpilogueUF, BestEpiPlan, EpilogILV,
10915-
DT, true, &ExpandedSCEVs);
10865+
DT, true);
10866+
10867+
// Fix induction resume values from the additional bypass block.
10868+
BasicBlock *BypassBlock = EpilogILV.getAdditionalBypassBlock();
10869+
IRBuilder<> BypassBuilder(BypassBlock,
10870+
BypassBlock->getFirstInsertionPt());
10871+
BasicBlock *PH = L->getLoopPreheader();
10872+
for (const auto &[IVPhi, II] : LVL.getInductionVars()) {
10873+
auto *Inc = cast<PHINode>(IVPhi->getIncomingValueForBlock(PH));
10874+
Value *V = createInductionAdditionalBypassValues(
10875+
IVPhi, II, BypassBuilder, ExpandedSCEVs, EPI.VectorTripCount,
10876+
LVL.getPrimaryInduction());
10877+
// TODO: Directly add as extra operand to the VPResumePHI recipe.
10878+
Inc->setIncomingValueForBlock(BypassBlock, V);
10879+
}
1091610880
++LoopsEpilogueVectorized;
1091710881

1091810882
if (!MainILV.areSafetyChecksAdded())

llvm/test/Transforms/LoopVectorize/X86/scatter_crash.ll

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -182,11 +182,11 @@ define void @_Z3fn1v() #0 {
182182
; CHECK-NEXT: [[DOTSPLATINSERT67:%.*]] = insertelement <8 x i64> poison, i64 [[BC_RESUME_VAL44]], i64 0
183183
; CHECK-NEXT: [[DOTSPLAT68:%.*]] = shufflevector <8 x i64> [[DOTSPLATINSERT67]], <8 x i64> poison, <8 x i32> zeroinitializer
184184
; CHECK-NEXT: [[INDUCTION69:%.*]] = add <8 x i64> [[DOTSPLAT68]], <i64 0, i64 2, i64 4, i64 6, i64 8, i64 10, i64 12, i64 14>
185-
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY51:%.*]]
186-
; CHECK: vec.epilog.vector.body51:
187-
; CHECK-NEXT: [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH42]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY51]] ]
188-
; CHECK-NEXT: [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY51]] ]
189-
; CHECK-NEXT: [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY51]] ]
185+
; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY49:%.*]]
186+
; CHECK: vec.epilog.vector.body49:
187+
; CHECK-NEXT: [[INDEX61:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL51]], [[VEC_EPILOG_PH42]] ], [ [[INDEX_NEXT74:%.*]], [[VEC_EPILOG_VECTOR_BODY49]] ]
188+
; CHECK-NEXT: [[VEC_IND65:%.*]] = phi <8 x i64> [ [[INDUCTION64]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT66:%.*]], [[VEC_EPILOG_VECTOR_BODY49]] ]
189+
; CHECK-NEXT: [[VEC_IND70:%.*]] = phi <8 x i64> [ [[INDUCTION69]], [[VEC_EPILOG_PH42]] ], [ [[VEC_IND_NEXT71:%.*]], [[VEC_EPILOG_VECTOR_BODY49]] ]
190190
; CHECK-NEXT: [[TMP44:%.*]] = sub nsw <8 x i64> splat (i64 8), [[VEC_IND65]]
191191
; CHECK-NEXT: [[TMP45:%.*]] = getelementptr inbounds [10 x [10 x i32]], ptr @d, i64 0, <8 x i64> [[VEC_IND65]]
192192
; CHECK-NEXT: [[TMP46:%.*]] = add nsw <8 x i64> [[TMP44]], [[VEC_IND70]]
@@ -205,8 +205,8 @@ define void @_Z3fn1v() #0 {
205205
; CHECK-NEXT: [[VEC_IND_NEXT66]] = add <8 x i64> [[VEC_IND65]], splat (i64 16)
206206
; CHECK-NEXT: [[VEC_IND_NEXT71]] = add <8 x i64> [[VEC_IND70]], splat (i64 16)
207207
; CHECK-NEXT: [[TMP55:%.*]] = icmp eq i64 [[INDEX_NEXT74]], [[N_VEC53]]
208-
; CHECK-NEXT: br i1 [[TMP55]], label [[VEC_EPILOG_MIDDLE_BLOCK40:%.*]], label [[VEC_EPILOG_VECTOR_BODY51]], !llvm.loop [[LOOP5:![0-9]+]]
209-
; CHECK: vec.epilog.middle.block64:
208+
; CHECK-NEXT: br i1 [[TMP55]], label [[VEC_EPILOG_MIDDLE_BLOCK40:%.*]], label [[VEC_EPILOG_VECTOR_BODY49]], !llvm.loop [[LOOP5:![0-9]+]]
209+
; CHECK: vec.epilog.middle.block62:
210210
; CHECK-NEXT: [[CMP_N65:%.*]] = icmp eq i64 [[TMP28]], [[N_VEC53]]
211211
; CHECK-NEXT: br i1 [[CMP_N65]], label [[FOR_COND_CLEANUP_LOOPEXIT]], label [[VEC_EPILOG_SCALAR_PH40]]
212212
; CHECK: vec.epilog.scalar.ph40:

0 commit comments

Comments
 (0)