@@ -1829,6 +1829,10 @@ class GeneratedRTChecks {
18291829 Loop *OuterLoop = nullptr ;
18301830
18311831public:
1832+ /// Set by VPlan when the vector loop should be entered even when runtime
1833+ /// checks determine that pointers alias within an iteration.
1834+ bool HasAliasMask = false ;
1835+
18321836 GeneratedRTChecks (ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
18331837 TargetTransformInfo *TTI, const DataLayout &DL,
18341838 bool AddBranchWeights)
@@ -1869,9 +1873,11 @@ class GeneratedRTChecks {
18691873
18701874 const auto &RtPtrChecking = *LAI.getRuntimePointerChecking ();
18711875 if (RtPtrChecking.Need ) {
1872- auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1873- MemCheckBlock = SplitBlock (Pred, Pred->getTerminator (), DT, LI, nullptr ,
1874- " vector.memcheck" );
1876+ if (!MemCheckBlock) {
1877+ auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1878+ MemCheckBlock = SplitBlock (Pred, Pred->getTerminator (), DT, LI, nullptr ,
1879+ " vector.memcheck" );
1880+ }
18751881
18761882 auto DiffChecks = RtPtrChecking.getDiffChecks ();
18771883 if (DiffChecks) {
@@ -1929,6 +1935,10 @@ class GeneratedRTChecks {
19291935 OuterLoop = L->getParentLoop ();
19301936 }
19311937
1938+ Value *expandCodeForMemCheck (const SCEV *Scev, Instruction *Loc) { // Expands Scev at Loc through MemCheckExp and returns the resulting Value.
1939+ return MemCheckExp.expandCodeFor (Scev, Scev->getType (), Loc); // The requested type is Scev's own type.
1940+ }
1941+
19321942 InstructionCost getCost () {
19331943 if (SCEVCheckBlock || MemCheckBlock)
19341944 LLVM_DEBUG (dbgs () << " Calculating cost of runtime checks:\n " );
@@ -2103,11 +2113,18 @@ class GeneratedRTChecks {
21032113 if (OuterLoop)
21042114 OuterLoop->addBasicBlockToLoop (MemCheckBlock, *LI);
21052115
2106- BranchInst &BI =
2107- *BranchInst::Create (Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2108- if (AddBranchWeights) {
2116+ // TODO: Branch to the vector preheader conditionally based on the number of
2117+ // non-aliasing elements. The scalar loop will likely be better if only one
2118+ // or two elements will be processed per vectorized loop iteration.
2119+
2120+ // Jump to the vector preheader unconditionally if it's safe to do so
2121+ // because an alias mask has been set up.
2122+ BranchInst &BI = HasAliasMask
2123+ ? *BranchInst::Create (LoopVectorPreHeader)
2124+ : *BranchInst::Create (Bypass, LoopVectorPreHeader,
2125+ MemRuntimeCheckCond);
2126+ if (!HasAliasMask && AddBranchWeights)
21092127 setBranchWeights (BI, MemCheckBypassWeights, /* IsExpected=*/ false );
2110- }
21112128 ReplaceInstWithInst (MemCheckBlock->getTerminator (), &BI);
21122129 MemCheckBlock->getTerminator ()->setDebugLoc (
21132130 Pred->getTerminator ()->getDebugLoc ());
@@ -2576,7 +2593,10 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
25762593 });
25772594 }
25782595
2579- LoopBypassBlocks.push_back (MemCheckBlock);
2596+ // If an alias mask has been set up then we don't need the bypass, as the
2597+ // vector preheader will be branched to unconditionally.
2598+ if (!RTChecks.HasAliasMask )
2599+ LoopBypassBlocks.push_back (MemCheckBlock);
25802600
25812601 AddedSafetyChecks = true ;
25822602
@@ -6885,7 +6905,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
68856905}
68866906
68876907std::optional<VectorizationFactor>
6888- LoopVectorizationPlanner::plan (ElementCount UserVF, unsigned UserIC) {
6908+ LoopVectorizationPlanner::plan (ElementCount UserVF, unsigned UserIC,
6909+ SmallVector<PointerDiffInfoValues> RTChecks,
6910+ bool &HasAliasMask) {
68896911 assert (OrigLoop->isInnermost () && " Inner loop expected." );
68906912 CM.collectValuesToIgnore ();
68916913 CM.collectElementTypesForWidening ();
@@ -6922,7 +6944,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
69226944 CM.collectInLoopReductions ();
69236945 if (CM.selectUserVectorizationFactor (UserVF)) {
69246946 LLVM_DEBUG (dbgs () << " LV: Using user VF " << UserVF << " .\n " );
6925- buildVPlansWithVPRecipes (UserVF, UserVF);
6947+ buildVPlansWithVPRecipes (UserVF, UserVF, RTChecks, HasAliasMask );
69266948 if (!hasPlanWithVF (UserVF)) {
69276949 LLVM_DEBUG (dbgs () << " LV: No VPlan could be built for " << UserVF
69286950 << " .\n " );
@@ -6956,8 +6978,10 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
69566978 CM.collectInstsToScalarize (VF);
69576979 }
69586980
6959- buildVPlansWithVPRecipes (ElementCount::getFixed (1 ), MaxFactors.FixedVF );
6960- buildVPlansWithVPRecipes (ElementCount::getScalable (1 ), MaxFactors.ScalableVF );
6981+ buildVPlansWithVPRecipes (ElementCount::getFixed (1 ), MaxFactors.FixedVF ,
6982+ RTChecks, HasAliasMask);
6983+ buildVPlansWithVPRecipes (ElementCount::getScalable (1 ), MaxFactors.ScalableVF ,
6984+ RTChecks, HasAliasMask);
69616985
69626986 LLVM_DEBUG (printPlans (dbgs ()));
69636987 if (VPlans.empty ())
@@ -7383,7 +7407,6 @@ LoopVectorizationPlanner::executePlan(
73837407 CanonicalIVStartValue, State);
73847408
73857409 BestVPlan.execute (&State);
7386-
73877410 // 2.5 Collect reduction resume values.
73887411 DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
73897412 auto *ExitVPBB =
@@ -7627,7 +7650,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
76277650 // reduction phis in the scalar loop preheader.
76287651 if (EPI.SCEVSafetyCheck )
76297652 LoopBypassBlocks.push_back (EPI.SCEVSafetyCheck );
7630- if (EPI.MemSafetyCheck )
7653+ if (EPI.MemSafetyCheck && !RTChecks. HasAliasMask )
76317654 LoopBypassBlocks.push_back (EPI.MemSafetyCheck );
76327655 LoopBypassBlocks.push_back (EPI.EpilogueIterationCountCheck );
76337656
@@ -7848,14 +7871,14 @@ void VPRecipeBuilder::createHeaderMask() {
78487871 // constructing the desired canonical IV in the header block as its first
78497872 // non-phi instructions.
78507873
7874+ VPValue *BlockMask = nullptr ;
78517875 VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion ()->getEntryBasicBlock ();
78527876 auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi ();
78537877 auto *IV = new VPWidenCanonicalIVRecipe (Plan.getCanonicalIV ());
78547878 HeaderVPBB->insert (IV, NewInsertionPoint);
78557879
78567880 VPBuilder::InsertPointGuard Guard (Builder);
78577881 Builder.setInsertPoint (HeaderVPBB, NewInsertionPoint);
7858- VPValue *BlockMask = nullptr ;
78597882 VPValue *BTC = Plan.getOrCreateBackedgeTakenCount ();
78607883 BlockMask = Builder.createICmp (CmpInst::ICMP_ULE, IV, BTC);
78617884 BlockMaskCache[Header] = BlockMask;
@@ -8350,14 +8373,16 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
83508373 return tryToWiden (Instr, Operands, VPBB);
83518374}
83528375
8353- void LoopVectorizationPlanner::buildVPlansWithVPRecipes (ElementCount MinVF,
8354- ElementCount MaxVF) {
8376+ void LoopVectorizationPlanner::buildVPlansWithVPRecipes (
8377+ ElementCount MinVF, ElementCount MaxVF,
8378+ SmallVector<PointerDiffInfoValues> RTChecks, bool &HasAliasMask) {
83558379 assert (OrigLoop->isInnermost () && " Inner loop expected." );
83568380
83578381 auto MaxVFTimes2 = MaxVF * 2 ;
83588382 for (ElementCount VF = MinVF; ElementCount::isKnownLT (VF, MaxVFTimes2);) {
83598383 VFRange SubRange = {VF, MaxVFTimes2};
8360- if (auto Plan = tryToBuildVPlanWithVPRecipes (SubRange)) {
8384+ if (auto Plan =
8385+ tryToBuildVPlanWithVPRecipes (SubRange, RTChecks, HasAliasMask)) {
83618386 // Now optimize the initial VPlan.
83628387 if (!Plan->hasVF (ElementCount::getFixed (1 )))
83638388 VPlanTransforms::truncateToMinimalBitwidths (
@@ -8378,7 +8403,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
83788403// Add the necessary canonical IV and branch recipes required to control the
83798404// loop.
83808405static void addCanonicalIVRecipes (VPlan &Plan, Type *IdxTy, bool HasNUW,
8381- DebugLoc DL) {
8406+ DebugLoc DL, VPValue *AliasMask ) {
83828407 Value *StartIdx = ConstantInt::get (IdxTy, 0 );
83838408 auto *StartV = Plan.getOrAddLiveIn (StartIdx);
83848409
@@ -8389,9 +8414,24 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
83898414 Header->insert (CanonicalIVPHI, Header->begin ());
83908415
83918416 VPBuilder Builder (TopRegion->getExitingBasicBlock ());
8392- // Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8417+ // Add a VPInstruction to increment the scalar canonical IV by VF * UF, or by
8418+ // the popcount of the alias mask if there is one.
8419+ VPValue *IncrementBy = &Plan.getVFxUF ();
8420+ if (AliasMask) {
8421+ IncrementBy = Builder.createNaryOp (VPInstruction::PopCount, {AliasMask}, DL,
8422+ " popcount" );
8423+ auto *IVType = CanonicalIVPHI->getScalarType ();
8424+
8425+ if (IVType->getScalarSizeInBits () < 64 ) {
8426+ auto *Cast =
8427+ new VPScalarCastRecipe (Instruction::Trunc, IncrementBy, IVType);
8428+ Cast->insertAfter (IncrementBy->getDefiningRecipe ());
8429+ IncrementBy = Cast;
8430+ }
8431+ }
8432+
83938433 auto *CanonicalIVIncrement = Builder.createOverflowingOp (
8394- Instruction::Add, {CanonicalIVPHI, &Plan. getVFxUF () }, {HasNUW, false }, DL,
8434+ Instruction::Add, {CanonicalIVPHI, IncrementBy }, {HasNUW, false }, DL,
83958435 " index.next" );
83968436 CanonicalIVPHI->addOperand (CanonicalIVIncrement);
83978437
@@ -8480,8 +8520,9 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) {
84808520 }
84818521}
84828522
8483- VPlanPtr
8484- LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes (VFRange &Range) {
8523+ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes (
8524+ VFRange &Range, SmallVector<PointerDiffInfoValues> RTChecks,
8525+ bool &HasAliasMask) {
84858526
84868527 SmallPtrSet<const InterleaveGroup<Instruction> *, 1 > InterleaveGroups;
84878528
@@ -8520,7 +8561,29 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
85208561 // When not folding the tail, we know that the induction increment will not
85218562 // overflow.
85228563 bool HasNUW = Style == TailFoldingStyle::None;
8523- addCanonicalIVRecipes (*Plan, Legal->getWidestInductionType (), HasNUW, DL);
8564+
8565+ VPValue *AliasMask = nullptr ;
8566+ if (useActiveLaneMask (Style)) {
8567+ // Create an alias mask for each possibly-aliasing pointer pair. If there
8568+ // are multiple they are combined together with ANDs.
8569+ VPRegionBlock *TopRegion = Plan->getVectorLoopRegion ();
8570+ auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor ());
8571+ VPBuilder Builder (VecPreheader);
8572+ for (auto C : RTChecks) {
8573+ HasAliasMask = true ;
8574+ VPValue *Sink = Plan->getOrAddLiveIn (C.Sink );
8575+ VPValue *Src = Plan->getOrAddLiveIn (C.Src );
8576+ VPValue *M =
8577+ Builder.createNaryOp (VPInstruction::AliasLaneMask, {Sink, Src}, DL,
8578+ " active.lane.mask.alias" );
8579+ if (AliasMask)
8580+ AliasMask = Builder.createAnd (AliasMask, M);
8581+ else
8582+ AliasMask = M;
8583+ }
8584+ }
8585+ addCanonicalIVRecipes (*Plan, Legal->getWidestInductionType (), HasNUW, DL,
8586+ AliasMask);
85248587
85258588 VPRecipeBuilder RecipeBuilder (*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
85268589
@@ -8737,7 +8800,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
87378800 bool WithoutRuntimeCheck =
87388801 Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
87398802 VPlanTransforms::addActiveLaneMask (*Plan, ForControlFlow,
8740- WithoutRuntimeCheck);
8803+ WithoutRuntimeCheck, AliasMask );
87418804 }
87428805 return Plan;
87438806}
@@ -8777,7 +8840,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
87778840 // is guaranteed to not wrap.
87788841 bool HasNUW = true ;
87798842 addCanonicalIVRecipes (*Plan, Legal->getWidestInductionType (), HasNUW,
8780- DebugLoc ());
8843+ DebugLoc (), nullptr );
87818844 assert (verifyVPlanIsValid (*Plan) && " VPlan is invalid" );
87828845 return Plan;
87838846}
@@ -9516,6 +9579,7 @@ static bool processLoopInVPlanNativePath(
95169579 // Mark the loop as already vectorized to avoid vectorizing again.
95179580 Hints.setAlreadyVectorized ();
95189581 assert (!verifyFunction (*L->getHeader ()->getParent (), &dbgs ()));
9582+
95199583 return true ;
95209584}
95219585
@@ -9838,16 +9902,33 @@ bool LoopVectorizePass::processLoop(Loop *L) {
98389902 ElementCount UserVF = Hints.getWidth ();
98399903 unsigned UserIC = Hints.getInterleave ();
98409904
9905+ bool AddBranchWeights =
9906+ hasBranchWeightMD (*L->getLoopLatch ()->getTerminator ());
9907+ GeneratedRTChecks Checks (*PSE.getSE (), DT, LI, TTI, F->getDataLayout (),
9908+ AddBranchWeights);
9909+
9910+ // VPlan needs the aliasing pointers as Values and not SCEVs, so expand them
9911+ // here and put them into a list.
9912+ std::optional<ArrayRef<PointerDiffInfo>> DiffChecks =
9913+ LVL.getLAI ()->getRuntimePointerChecking ()->getDiffChecks ();
9914+ SmallVector<PointerDiffInfoValues> DiffChecksValues;
9915+ if (DiffChecks.has_value () &&
9916+ useActiveLaneMask (CM.getTailFoldingStyle (true ))) {
9917+ Instruction *Loc = L->getLoopPreheader ()->getTerminator ();
9918+ for (auto Check : *DiffChecks) {
9919+ Value *Sink = Checks.expandCodeForMemCheck (Check.SinkStart , Loc);
9920+ Value *Src = Checks.expandCodeForMemCheck (Check.SrcStart , Loc);
9921+ DiffChecksValues.push_back (PointerDiffInfoValues (Src, Sink));
9922+ }
9923+ }
9924+
98419925 // Plan how to best vectorize, return the best VF and its cost.
9842- std::optional<VectorizationFactor> MaybeVF = LVP.plan (UserVF, UserIC);
9926+ std::optional<VectorizationFactor> MaybeVF =
9927+ LVP.plan (UserVF, UserIC, DiffChecksValues, Checks.HasAliasMask );
98439928
98449929 VectorizationFactor VF = VectorizationFactor::Disabled ();
98459930 unsigned IC = 1 ;
98469931
9847- bool AddBranchWeights =
9848- hasBranchWeightMD (*L->getLoopLatch ()->getTerminator ());
9849- GeneratedRTChecks Checks (*PSE.getSE (), DT, LI, TTI,
9850- F->getDataLayout (), AddBranchWeights);
98519932 if (MaybeVF) {
98529933 VF = *MaybeVF;
98539934 // Select the interleave count.
0 commit comments