Skip to content

Commit 96b8b65

Browse files
committed
[LV] Mask off possibly aliasing vector lanes
When vectorising a loop that uses loads and stores, the pointers being accessed could overlap if their difference is less than the vectorisation factor. For example, if address 20 is being stored to and address 23 is being loaded from, they overlap when the vectorisation factor is 4 or higher. Currently LoopVectorize guards against these cases with a runtime check that branches to a scalar loop. However, if we construct a mask that disables the overlapping (aliasing) lanes, then the vectorised loop can be safely entered, as long as the loads and stores are predicated on that mask.
1 parent d4630ae commit 96b8b65

File tree

13 files changed

+443
-89
lines changed

13 files changed

+443
-89
lines changed

llvm/include/llvm/Analysis/TargetTransformInfo.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,13 @@ enum class TailFoldingStyle {
194194
DataWithEVL,
195195
};
196196

197+
enum class RTCheckStyle {
198+
/// Branch to the scalar loop if the checks fail at runtime.
199+
ScalarFallback,
200+
/// Form a mask of the elements which won't cause a WAR or RAW hazard.
201+
UseSafeEltsMask,
202+
};
203+
197204
struct TailFoldingInfo {
198205
TargetLibraryInfo *TLI;
199206
LoopVectorizationLegality *LVL;

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -418,7 +418,13 @@ class LoopVectorizationPlanner {
418418
/// Build VPlans for the specified \p UserVF and \p UserIC if they are
419419
/// non-zero or all applicable candidate VFs otherwise. If vectorization and
420420
/// interleaving should be avoided up-front, no plans are generated.
421-
void plan(ElementCount UserVF, unsigned UserIC);
421+
/// DiffChecks is a list of pointer pairs that should be checked for aliasing,
422+
/// setting HasAliasMask to true in the case that an alias mask is generated
423+
/// and the vector loop should be entered even if the pointers alias across a
424+
/// loop iteration.
425+
void plan(ElementCount UserVF, unsigned UserIC,
426+
std::optional<ArrayRef<PointerDiffInfo>> DiffChecks,
427+
bool &HasAliasMask);
422428

423429
/// Use the VPlan-native path to plan how to best vectorize, return the best
424430
/// VF and its cost.
@@ -495,12 +501,22 @@ class LoopVectorizationPlanner {
495501
/// returned VPlan is valid for. If no VPlan can be built for the input range,
496502
/// set the largest included VF to the maximum VF for which no plan could be
497503
/// built.
498-
VPlanPtr tryToBuildVPlanWithVPRecipes(VFRange &Range);
504+
/// RTChecks is a list of pointer pairs that should be checked for aliasing,
505+
/// setting HasAliasMask to true in the case that an alias mask is generated
506+
/// and the vector loop should be entered even if the pointers alias across a
507+
/// loop iteration.
508+
VPlanPtr tryToBuildVPlanWithVPRecipes(VFRange &Range,
509+
ArrayRef<PointerDiffInfo> RTChecks,
510+
bool &HasAliasMask);
499511

500512
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
501513
/// according to the information gathered by Legal when it checked if it is
502514
/// legal to vectorize the loop. This method creates VPlans using VPRecipes.
503-
void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF);
515+
/// RTChecks contains a list of pointer pairs that an alias mask should be
516+
/// generated for.
517+
void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF,
518+
ArrayRef<PointerDiffInfo> RTChecks,
519+
bool &HasAliasMask);
504520

505521
// Adjust the recipes for reductions. For in-loop reductions the chain of
506522
// instructions leading from the loop exit instr to the phi need to be

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 62 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,7 @@ const char LLVMLoopVectorizeFollowupEpilogue[] =
173173
STATISTIC(LoopsVectorized, "Number of loops vectorized");
174174
STATISTIC(LoopsAnalyzed, "Number of loops analyzed for vectorization");
175175
STATISTIC(LoopsEpilogueVectorized, "Number of epilogues vectorized");
176+
STATISTIC(LoopsAliasMasked, "Number of loops predicated with an alias mask");
176177

177178
static cl::opt<bool> EnableEpilogueVectorization(
178179
"enable-epilogue-vectorization", cl::init(true), cl::Hidden,
@@ -1806,6 +1807,10 @@ class GeneratedRTChecks {
18061807
PredicatedScalarEvolution &PSE;
18071808

18081809
public:
1810+
/// Set by VPlan when the vector loop should be entered even when runtime
1811+
/// checks determine that pointers alias within an iteration.
1812+
bool HasAliasMask = false;
1813+
18091814
GeneratedRTChecks(PredicatedScalarEvolution &PSE, DominatorTree *DT,
18101815
LoopInfo *LI, TargetTransformInfo *TTI,
18111816
const DataLayout &DL, bool AddBranchWeights)
@@ -1847,9 +1852,11 @@ class GeneratedRTChecks {
18471852

18481853
const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
18491854
if (RtPtrChecking.Need) {
1850-
auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1851-
MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1852-
"vector.memcheck");
1855+
if (!MemCheckBlock) {
1856+
auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1857+
MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1858+
"vector.memcheck");
1859+
}
18531860

18541861
auto DiffChecks = RtPtrChecking.getDiffChecks();
18551862
if (DiffChecks) {
@@ -2077,11 +2084,18 @@ class GeneratedRTChecks {
20772084
if (OuterLoop)
20782085
OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
20792086

2080-
BranchInst &BI =
2081-
*BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2082-
if (AddBranchWeights) {
2087+
// TODO: Branch to the vector preheader conditionally based on the number of
2088+
// non-aliasing elements. The scalar loop will likely be better if only one
2089+
// or two elements will be processed per vectorised loop iteration.
2090+
2091+
// Jump to the vector preheader unconditionally if it's safe to do so
2092+
// because an alias mask has been set up.
2093+
BranchInst &BI = HasAliasMask
2094+
? *BranchInst::Create(LoopVectorPreHeader)
2095+
: *BranchInst::Create(Bypass, LoopVectorPreHeader,
2096+
MemRuntimeCheckCond);
2097+
if (!HasAliasMask && AddBranchWeights)
20832098
setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2084-
}
20852099
ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
20862100
MemCheckBlock->getTerminator()->setDebugLoc(
20872101
Pred->getTerminator()->getDebugLoc());
@@ -2564,7 +2578,10 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
25642578
});
25652579
}
25662580

2567-
LoopBypassBlocks.push_back(MemCheckBlock);
2581+
/// If an alias mask has been set up then we don't need the bypass as the
2582+
/// vector preheader will be branched to unconditionally.
2583+
if (!RTChecks.HasAliasMask)
2584+
LoopBypassBlocks.push_back(MemCheckBlock);
25682585

25692586
AddedSafetyChecks = true;
25702587

@@ -7097,7 +7114,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
70977114
return VectorizationFactor::Disabled();
70987115
}
70997116

7100-
void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
7117+
void LoopVectorizationPlanner::plan(
7118+
ElementCount UserVF, unsigned UserIC,
7119+
std::optional<ArrayRef<PointerDiffInfo>> RTChecks, bool &HasAliasMask) {
71017120
assert(OrigLoop->isInnermost() && "Inner loop expected.");
71027121
CM.collectValuesToIgnore();
71037122
CM.collectElementTypesForWidening();
@@ -7106,6 +7125,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
71067125
if (!MaxFactors) // Cases that should not to be vectorized nor interleaved.
71077126
return;
71087127

7128+
ArrayRef<PointerDiffInfo> DiffChecks;
7129+
if (RTChecks.has_value() && useActiveLaneMask(CM.getTailFoldingStyle(true)))
7130+
DiffChecks = *RTChecks;
7131+
71097132
// Invalidate interleave groups if all blocks of loop will be predicated.
71107133
if (CM.blockNeedsPredicationForAnyReason(OrigLoop->getHeader()) &&
71117134
!useMaskedInterleavedAccesses(TTI)) {
@@ -7138,7 +7161,7 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
71387161
CM.collectInLoopReductions();
71397162
if (CM.selectUserVectorizationFactor(UserVF)) {
71407163
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
7141-
buildVPlansWithVPRecipes(UserVF, UserVF);
7164+
buildVPlansWithVPRecipes(UserVF, UserVF, DiffChecks, HasAliasMask);
71427165
LLVM_DEBUG(printPlans(dbgs()));
71437166
return;
71447167
}
@@ -7167,8 +7190,10 @@ void LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
71677190
CM.collectInstsToScalarize(VF);
71687191
}
71697192

7170-
buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
7171-
buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
7193+
buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
7194+
DiffChecks, HasAliasMask);
7195+
buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
7196+
DiffChecks, HasAliasMask);
71727197

71737198
LLVM_DEBUG(printPlans(dbgs()));
71747199
}
@@ -7690,7 +7715,6 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
76907715
CanonicalIVStartValue, State);
76917716

76927717
BestVPlan.execute(&State);
7693-
76947718
// 2.5 Collect reduction resume values.
76957719
auto *ExitVPBB =
76967720
cast<VPBasicBlock>(BestVPlan.getVectorLoopRegion()->getSingleSuccessor());
@@ -7923,7 +7947,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
79237947
// reduction phis in the scalar loop preheader.
79247948
if (EPI.SCEVSafetyCheck)
79257949
LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7926-
if (EPI.MemSafetyCheck)
7950+
if (EPI.MemSafetyCheck && !RTChecks.HasAliasMask)
79277951
LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
79287952
LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
79297953

@@ -8179,9 +8203,8 @@ void VPRecipeBuilder::createHeaderMask() {
81798203

81808204
VPBuilder::InsertPointGuard Guard(Builder);
81818205
Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
8182-
VPValue *BlockMask = nullptr;
81838206
VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
8184-
BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
8207+
VPValue *BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
81858208
BlockMaskCache[Header] = BlockMask;
81868209
}
81878210

@@ -8720,14 +8743,16 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
87208743
return tryToWiden(Instr, Operands, VPBB);
87218744
}
87228745

8723-
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8724-
ElementCount MaxVF) {
8746+
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(
8747+
ElementCount MinVF, ElementCount MaxVF, ArrayRef<PointerDiffInfo> RTChecks,
8748+
bool &HasAliasMask) {
87258749
assert(OrigLoop->isInnermost() && "Inner loop expected.");
87268750

87278751
auto MaxVFTimes2 = MaxVF * 2;
87288752
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
87298753
VFRange SubRange = {VF, MaxVFTimes2};
8730-
if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8754+
if (auto Plan =
8755+
tryToBuildVPlanWithVPRecipes(SubRange, RTChecks, HasAliasMask)) {
87318756
// Now optimize the initial VPlan.
87328757
if (!Plan->hasVF(ElementCount::getFixed(1)))
87338758
VPlanTransforms::truncateToMinimalBitwidths(*Plan,
@@ -8760,6 +8785,8 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
87608785

87618786
VPBuilder Builder(TopRegion->getExitingBasicBlock());
87628787
// Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8788+
// If an alias mask is present, this will be replaced by an increment of the
8789+
// mask's popcount.
87638790
auto *CanonicalIVIncrement = Builder.createOverflowingOp(
87648791
Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
87658792
"index.next");
@@ -8978,8 +9005,8 @@ static void addLiveOutsForFirstOrderRecurrences(
89789005
}
89799006
}
89809007

8981-
VPlanPtr
8982-
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
9008+
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
9009+
VFRange &Range, ArrayRef<PointerDiffInfo> RTChecks, bool &HasAliasMask) {
89839010

89849011
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
89859012

@@ -9215,7 +9242,9 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
92159242
bool WithoutRuntimeCheck =
92169243
Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
92179244
VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
9218-
WithoutRuntimeCheck);
9245+
WithoutRuntimeCheck, PSE, RTChecks);
9246+
if (ForControlFlow && !RTChecks.empty())
9247+
HasAliasMask = true;
92199248
}
92209249
return Plan;
92219250
}
@@ -9699,6 +9728,7 @@ static bool processLoopInVPlanNativePath(
96999728
// Mark the loop as already vectorized to avoid vectorizing again.
97009729
Hints.setAlreadyVectorized();
97019730
assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9731+
97029732
return true;
97039733
}
97049734

@@ -10030,18 +10060,23 @@ bool LoopVectorizePass::processLoop(Loop *L) {
1003010060
ElementCount UserVF = Hints.getWidth();
1003110061
unsigned UserIC = Hints.getInterleave();
1003210062

10063+
bool AddBranchWeights =
10064+
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10065+
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10066+
AddBranchWeights);
10067+
1003310068
// Plan how to best vectorize.
10034-
LVP.plan(UserVF, UserIC);
10069+
LVP.plan(UserVF, UserIC,
10070+
LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks(),
10071+
Checks.HasAliasMask);
1003510072
VectorizationFactor VF = LVP.computeBestVF();
10073+
if (Checks.HasAliasMask)
10074+
LoopsAliasMasked++;
1003610075
unsigned IC = 1;
1003710076

1003810077
if (ORE->allowExtraAnalysis(LV_NAME))
1003910078
LVP.emitInvalidCostRemarks(ORE);
1004010079

10041-
bool AddBranchWeights =
10042-
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
10043-
GeneratedRTChecks Checks(PSE, DT, LI, TTI, F->getDataLayout(),
10044-
AddBranchWeights);
1004510080
if (LVP.hasPlanWithVF(VF.Width)) {
1004610081
// Select the interleave count.
1004710082
IC = CM.selectInterleaveCount(VF.Width, VF.Cost);

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -962,7 +962,6 @@ void VPlan::prepareToExecute(Value *TripCountV, Value *VectorTripCountV,
962962

963963
IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
964964
// FIXME: Model VF * UF computation completely in VPlan.
965-
assert(VFxUF.getNumUsers() && "VFxUF expected to always have users");
966965
unsigned UF = getUF();
967966
if (VF.getNumUsers()) {
968967
Value *RuntimeVF = getRuntimeVF(Builder, TCTy, State.VF);

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -895,6 +895,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
895895
switch (R->getVPDefID()) {
896896
case VPRecipeBase::VPDerivedIVSC:
897897
case VPRecipeBase::VPEVLBasedIVPHISC:
898+
case VPRecipeBase::VPAliasLaneMaskSC:
898899
case VPRecipeBase::VPExpandSCEVSC:
899900
case VPRecipeBase::VPInstructionSC:
900901
case VPRecipeBase::VPReductionEVLSC:
@@ -1270,6 +1271,7 @@ class VPInstruction : public VPRecipeWithIRFlags,
12701271
// last. The second operand must be a positive constant and <= VF.
12711272
ExtractFromEnd,
12721273
LogicalAnd, // Non-poison propagating logical And.
1274+
PopCount,
12731275
// Add an offset in bytes (second operand) to a base pointer (first
12741276
// operand). Only generates scalar values (either for the first lane only or
12751277
// for all lanes, depending on its uses).
@@ -2993,6 +2995,52 @@ struct VPWidenStoreEVLRecipe final : public VPWidenMemoryRecipe {
29932995
}
29942996
};
29952997

2998+
// Given a pointer A that is being stored to, and pointer B that is being
2999+
// read from, both with unknown lengths, create a mask that disables
3000+
// elements which could overlap across a loop iteration. For example, if A
3001+
// is X and B is X + 2 with VF being 4, only the final two elements of the
3002+
// loaded vector can be stored since they don't overlap with the stored
3003+
// vector. %b.vec = load %b ; = [s, t, u, v]
3004+
// [...]
3005+
// store %a, %b.vec ; only u and v can be stored as their addresses don't
3006+
// overlap with %a + (VF - 1)
3007+
class VPAliasLaneMaskRecipe : public VPSingleDefRecipe {
3008+
3009+
public:
3010+
VPAliasLaneMaskRecipe(VPValue *Src, VPValue *Sink, unsigned ElementSize)
3011+
: VPSingleDefRecipe(VPDef::VPAliasLaneMaskSC, {Src, Sink}),
3012+
ElementSize(ElementSize) {}
3013+
3014+
~VPAliasLaneMaskRecipe() override = default;
3015+
3016+
VPAliasLaneMaskRecipe *clone() override {
3017+
return new VPAliasLaneMaskRecipe(getSourceValue(), getSinkValue(),
3018+
ElementSize);
3019+
}
3020+
3021+
VP_CLASSOF_IMPL(VPDef::VPAliasLaneMaskSC);
3022+
3023+
void execute(VPTransformState &State) override;
3024+
3025+
/// Get the VPValue* for the pointer being read from
3026+
VPValue *getSourceValue() const { return getOperand(0); }
3027+
3028+
/// Get the size of the element(s) accessed by the pointers.
3029+
unsigned getAccessedElementSize() const { return ElementSize; }
3030+
3031+
/// Get the VPValue* for the pointer being stored to
3032+
VPValue *getSinkValue() const { return getOperand(1); }
3033+
3034+
private:
3035+
unsigned ElementSize;
3036+
3037+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3038+
/// Print the recipe.
3039+
void print(raw_ostream &O, const Twine &Indent,
3040+
VPSlotTracker &SlotTracker) const override;
3041+
#endif
3042+
};
3043+
29963044
/// Recipe to expand a SCEV expression.
29973045
class VPExpandSCEVRecipe : public VPSingleDefRecipe {
29983046
const SCEV *Expr;

0 commit comments

Comments
 (0)