Skip to content

Commit 5019914

Browse files
committed
[LV] Mask off possibly aliasing vector lanes
When vectorising a loop that contains loads and stores, the pointers being accessed could overlap if the distance between them is less than the vectorisation factor (VF). For example, with one-byte elements, if address 20 is being stored to and address 23 is being loaded from, the accesses overlap when the VF is 4 or higher. Currently LoopVectorize guards against this with a runtime check that branches to a scalar loop. However, if we construct a mask that disables the overlapping (aliasing) lanes, then the vectorised loop can be entered safely, as long as the loads and stores are predicated on that mask. This PR modifies the LoopVectorizer and VPlan to create such a mask and always branch to the vector loop. Currently this is only done if we're tail-predicating, but more work will come in the future to do this in other cases as well.
1 parent c7a3346 commit 5019914

File tree

11 files changed

+1102
-107
lines changed

11 files changed

+1102
-107
lines changed

clang/test/CodeGen/loop-alias-mask.c

Lines changed: 404 additions & 0 deletions
Large diffs are not rendered by default.

llvm/include/llvm/Analysis/LoopAccessAnalysis.h

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@
1616

1717
#include "llvm/ADT/EquivalenceClasses.h"
1818
#include "llvm/Analysis/LoopAnalysisManager.h"
19-
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
2019
#include "llvm/IR/DiagnosticInfo.h"
20+
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
2121
#include <optional>
2222
#include <variant>
2323

@@ -448,6 +448,20 @@ struct PointerDiffInfo {
448448
NeedsFreeze(NeedsFreeze) {}
449449
};
450450

451+
/// A pair of pointers that could overlap across a loop iteration.
452+
struct PointerDiffInfoValues {
453+
/// The pointer being read from
454+
Value *Src;
455+
/// The pointer being stored to
456+
Value *Sink;
457+
458+
PointerDiffInfoValues(const SCEV *SrcStart, const SCEV *SinkStart,
459+
SCEVExpander Exp, Instruction *Loc)
460+
: Src(Exp.expandCodeFor(SrcStart, SrcStart->getType(), Loc)),
461+
Sink(Exp.expandCodeFor(SinkStart, SinkStart->getType(), Loc)) {}
462+
PointerDiffInfoValues(Value *Src, Value *Sink) : Src(Src), Sink(Sink) {}
463+
};
464+
451465
/// Holds information about the memory runtime legality checks to verify
452466
/// that a group of pointers do not overlap.
453467
class RuntimePointerChecking {

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 21 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@
2626

2727
#include "VPlan.h"
2828
#include "llvm/ADT/SmallSet.h"
29+
#include "llvm/Analysis/LoopAccessAnalysis.h"
2930
#include "llvm/Support/InstructionCost.h"
3031

3132
namespace llvm {
@@ -356,7 +357,13 @@ class LoopVectorizationPlanner {
356357

357358
/// Plan how to best vectorize, return the best VF and its cost, or
358359
/// std::nullopt if vectorization and interleaving should be avoided up front.
359-
std::optional<VectorizationFactor> plan(ElementCount UserVF, unsigned UserIC);
360+
/// RTChecks is a list of pointer pairs that should be checked for aliasing,
361+
/// setting HasAliasMask to true in the case that an alias mask is generated
362+
/// and the vector loop should be entered even if the pointers alias across a
363+
/// loop iteration.
364+
std::optional<VectorizationFactor>
365+
plan(ElementCount UserVF, unsigned UserIC,
366+
SmallVector<PointerDiffInfoValues> RTChecks, bool &HasAliasMask);
360367

361368
/// Use the VPlan-native path to plan how to best vectorize, return the best
362369
/// VF and its cost.
@@ -429,12 +436,23 @@ class LoopVectorizationPlanner {
429436
/// returned VPlan is valid for. If no VPlan can be built for the input range,
430437
/// set the largest included VF to the maximum VF for which no plan could be
431438
/// built.
432-
VPlanPtr tryToBuildVPlanWithVPRecipes(VFRange &Range);
439+
/// RTChecks is a list of pointer pairs that should be checked for aliasing,
440+
/// setting HasAliasMask to true in the case that an alias mask is generated
441+
/// and the vector loop should be entered even if the pointers alias across a
442+
/// loop iteration.
443+
VPlanPtr
444+
tryToBuildVPlanWithVPRecipes(VFRange &Range,
445+
SmallVector<PointerDiffInfoValues> RTChecks,
446+
bool &HasAliasMask);
433447

434448
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
435449
/// according to the information gathered by Legal when it checked if it is
436450
/// legal to vectorize the loop. This method creates VPlans using VPRecipes.
437-
void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF);
451+
/// RTChecks contains a list of pointer pairs that an alias mask should be
452+
/// generated for.
453+
void buildVPlansWithVPRecipes(ElementCount MinVF, ElementCount MaxVF,
454+
SmallVector<PointerDiffInfoValues> RTChecks,
455+
bool &HasAliasMask);
438456

439457
// Adjust the recipes for reductions. For in-loop reductions the chain of
440458
// instructions leading from the loop exit instr to the phi need to be

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 112 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -1829,6 +1829,10 @@ class GeneratedRTChecks {
18291829
Loop *OuterLoop = nullptr;
18301830

18311831
public:
1832+
/// Set by VPlan when the vector loop should be entered even when runtime
1833+
/// checks determine that pointers alias within an iteration.
1834+
bool HasAliasMask = false;
1835+
18321836
GeneratedRTChecks(ScalarEvolution &SE, DominatorTree *DT, LoopInfo *LI,
18331837
TargetTransformInfo *TTI, const DataLayout &DL,
18341838
bool AddBranchWeights)
@@ -1869,9 +1873,11 @@ class GeneratedRTChecks {
18691873

18701874
const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
18711875
if (RtPtrChecking.Need) {
1872-
auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1873-
MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1874-
"vector.memcheck");
1876+
if (!MemCheckBlock) {
1877+
auto *Pred = SCEVCheckBlock ? SCEVCheckBlock : Preheader;
1878+
MemCheckBlock = SplitBlock(Pred, Pred->getTerminator(), DT, LI, nullptr,
1879+
"vector.memcheck");
1880+
}
18751881

18761882
auto DiffChecks = RtPtrChecking.getDiffChecks();
18771883
if (DiffChecks) {
@@ -1929,6 +1935,10 @@ class GeneratedRTChecks {
19291935
OuterLoop = L->getParentLoop();
19301936
}
19311937

1938+
Value *expandCodeForMemCheck(const SCEV *Scev, Instruction *Loc) {
1939+
return MemCheckExp.expandCodeFor(Scev, Scev->getType(), Loc);
1940+
}
1941+
19321942
InstructionCost getCost() {
19331943
if (SCEVCheckBlock || MemCheckBlock)
19341944
LLVM_DEBUG(dbgs() << "Calculating cost of runtime checks:\n");
@@ -2103,11 +2113,18 @@ class GeneratedRTChecks {
21032113
if (OuterLoop)
21042114
OuterLoop->addBasicBlockToLoop(MemCheckBlock, *LI);
21052115

2106-
BranchInst &BI =
2107-
*BranchInst::Create(Bypass, LoopVectorPreHeader, MemRuntimeCheckCond);
2108-
if (AddBranchWeights) {
2116+
// TODO: Branch to the vector preheader conditionally based on the number of
2117+
// non-aliasing elements. The scalar loop will likely be better if only one
2118+
// or two elements will be processed per vectorised loop iteration.
2119+
2120+
// Jump to the vector preheader unconditionally if it's safe to do so
2121+
// because an alias mask has been set up.
2122+
BranchInst &BI = HasAliasMask
2123+
? *BranchInst::Create(LoopVectorPreHeader)
2124+
: *BranchInst::Create(Bypass, LoopVectorPreHeader,
2125+
MemRuntimeCheckCond);
2126+
if (!HasAliasMask && AddBranchWeights)
21092127
setBranchWeights(BI, MemCheckBypassWeights, /*IsExpected=*/false);
2110-
}
21112128
ReplaceInstWithInst(MemCheckBlock->getTerminator(), &BI);
21122129
MemCheckBlock->getTerminator()->setDebugLoc(
21132130
Pred->getTerminator()->getDebugLoc());
@@ -2576,7 +2593,10 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
25762593
});
25772594
}
25782595

2579-
LoopBypassBlocks.push_back(MemCheckBlock);
2596+
/// If an alias mask has been set up then we don't need the bypass as the
2597+
/// vector preheader will be branched to unconditionally
2598+
if (!RTChecks.HasAliasMask)
2599+
LoopBypassBlocks.push_back(MemCheckBlock);
25802600

25812601
AddedSafetyChecks = true;
25822602

@@ -6885,7 +6905,9 @@ LoopVectorizationPlanner::planInVPlanNativePath(ElementCount UserVF) {
68856905
}
68866906

68876907
std::optional<VectorizationFactor>
6888-
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
6908+
LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC,
6909+
SmallVector<PointerDiffInfoValues> RTChecks,
6910+
bool &HasAliasMask) {
68896911
assert(OrigLoop->isInnermost() && "Inner loop expected.");
68906912
CM.collectValuesToIgnore();
68916913
CM.collectElementTypesForWidening();
@@ -6922,7 +6944,7 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
69226944
CM.collectInLoopReductions();
69236945
if (CM.selectUserVectorizationFactor(UserVF)) {
69246946
LLVM_DEBUG(dbgs() << "LV: Using user VF " << UserVF << ".\n");
6925-
buildVPlansWithVPRecipes(UserVF, UserVF);
6947+
buildVPlansWithVPRecipes(UserVF, UserVF, RTChecks, HasAliasMask);
69266948
if (!hasPlanWithVF(UserVF)) {
69276949
LLVM_DEBUG(dbgs() << "LV: No VPlan could be built for " << UserVF
69286950
<< ".\n");
@@ -6956,8 +6978,10 @@ LoopVectorizationPlanner::plan(ElementCount UserVF, unsigned UserIC) {
69566978
CM.collectInstsToScalarize(VF);
69576979
}
69586980

6959-
buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF);
6960-
buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF);
6981+
buildVPlansWithVPRecipes(ElementCount::getFixed(1), MaxFactors.FixedVF,
6982+
RTChecks, HasAliasMask);
6983+
buildVPlansWithVPRecipes(ElementCount::getScalable(1), MaxFactors.ScalableVF,
6984+
RTChecks, HasAliasMask);
69616985

69626986
LLVM_DEBUG(printPlans(dbgs()));
69636987
if (VPlans.empty())
@@ -7383,7 +7407,6 @@ LoopVectorizationPlanner::executePlan(
73837407
CanonicalIVStartValue, State);
73847408

73857409
BestVPlan.execute(&State);
7386-
73877410
// 2.5 Collect reduction resume values.
73887411
DenseMap<const RecurrenceDescriptor *, Value *> ReductionResumeValues;
73897412
auto *ExitVPBB =
@@ -7627,7 +7650,7 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton(
76277650
// reduction phis in the scalar loop preheader.
76287651
if (EPI.SCEVSafetyCheck)
76297652
LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck);
7630-
if (EPI.MemSafetyCheck)
7653+
if (EPI.MemSafetyCheck && !RTChecks.HasAliasMask)
76317654
LoopBypassBlocks.push_back(EPI.MemSafetyCheck);
76327655
LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck);
76337656

@@ -7848,14 +7871,14 @@ void VPRecipeBuilder::createHeaderMask() {
78487871
// constructing the desired canonical IV in the header block as its first
78497872
// non-phi instructions.
78507873

7874+
VPValue *BlockMask = nullptr;
78517875
VPBasicBlock *HeaderVPBB = Plan.getVectorLoopRegion()->getEntryBasicBlock();
78527876
auto NewInsertionPoint = HeaderVPBB->getFirstNonPhi();
78537877
auto *IV = new VPWidenCanonicalIVRecipe(Plan.getCanonicalIV());
78547878
HeaderVPBB->insert(IV, NewInsertionPoint);
78557879

78567880
VPBuilder::InsertPointGuard Guard(Builder);
78577881
Builder.setInsertPoint(HeaderVPBB, NewInsertionPoint);
7858-
VPValue *BlockMask = nullptr;
78597882
VPValue *BTC = Plan.getOrCreateBackedgeTakenCount();
78607883
BlockMask = Builder.createICmp(CmpInst::ICMP_ULE, IV, BTC);
78617884
BlockMaskCache[Header] = BlockMask;
@@ -8350,14 +8373,16 @@ VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
83508373
return tryToWiden(Instr, Operands, VPBB);
83518374
}
83528375

8353-
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
8354-
ElementCount MaxVF) {
8376+
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(
8377+
ElementCount MinVF, ElementCount MaxVF,
8378+
SmallVector<PointerDiffInfoValues> RTChecks, bool &HasAliasMask) {
83558379
assert(OrigLoop->isInnermost() && "Inner loop expected.");
83568380

83578381
auto MaxVFTimes2 = MaxVF * 2;
83588382
for (ElementCount VF = MinVF; ElementCount::isKnownLT(VF, MaxVFTimes2);) {
83598383
VFRange SubRange = {VF, MaxVFTimes2};
8360-
if (auto Plan = tryToBuildVPlanWithVPRecipes(SubRange)) {
8384+
if (auto Plan =
8385+
tryToBuildVPlanWithVPRecipes(SubRange, RTChecks, HasAliasMask)) {
83618386
// Now optimize the initial VPlan.
83628387
if (!Plan->hasVF(ElementCount::getFixed(1)))
83638388
VPlanTransforms::truncateToMinimalBitwidths(
@@ -8378,7 +8403,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
83788403
// Add the necessary canonical IV and branch recipes required to control the
83798404
// loop.
83808405
static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
8381-
DebugLoc DL) {
8406+
DebugLoc DL, VPValue *AliasMask) {
83828407
Value *StartIdx = ConstantInt::get(IdxTy, 0);
83838408
auto *StartV = Plan.getOrAddLiveIn(StartIdx);
83848409

@@ -8389,9 +8414,24 @@ static void addCanonicalIVRecipes(VPlan &Plan, Type *IdxTy, bool HasNUW,
83898414
Header->insert(CanonicalIVPHI, Header->begin());
83908415

83918416
VPBuilder Builder(TopRegion->getExitingBasicBlock());
8392-
// Add a VPInstruction to increment the scalar canonical IV by VF * UF.
8417+
// Add a VPInstruction to increment the scalar canonical IV by VF * UF, or the
8418+
// popcount of the alias mask if there is one
8419+
VPValue *IncrementBy = &Plan.getVFxUF();
8420+
if (AliasMask) {
8421+
IncrementBy = Builder.createNaryOp(VPInstruction::PopCount, {AliasMask}, DL,
8422+
"popcount");
8423+
auto *IVType = CanonicalIVPHI->getScalarType();
8424+
8425+
if (IVType->getScalarSizeInBits() < 64) {
8426+
auto *Cast =
8427+
new VPScalarCastRecipe(Instruction::Trunc, IncrementBy, IVType);
8428+
Cast->insertAfter(IncrementBy->getDefiningRecipe());
8429+
IncrementBy = Cast;
8430+
}
8431+
}
8432+
83938433
auto *CanonicalIVIncrement = Builder.createOverflowingOp(
8394-
Instruction::Add, {CanonicalIVPHI, &Plan.getVFxUF()}, {HasNUW, false}, DL,
8434+
Instruction::Add, {CanonicalIVPHI, IncrementBy}, {HasNUW, false}, DL,
83958435
"index.next");
83968436
CanonicalIVPHI->addOperand(CanonicalIVIncrement);
83978437

@@ -8480,8 +8520,9 @@ static void addLiveOutsForFirstOrderRecurrences(VPlan &Plan) {
84808520
}
84818521
}
84828522

8483-
VPlanPtr
8484-
LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
8523+
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
8524+
VFRange &Range, SmallVector<PointerDiffInfoValues> RTChecks,
8525+
bool &HasAliasMask) {
84858526

84868527
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
84878528

@@ -8520,7 +8561,29 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
85208561
// When not folding the tail, we know that the induction increment will not
85218562
// overflow.
85228563
bool HasNUW = Style == TailFoldingStyle::None;
8523-
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL);
8564+
8565+
VPValue *AliasMask = nullptr;
8566+
if (useActiveLaneMask(Style)) {
8567+
// Create an alias mask for each possibly-aliasing pointer pair. If there
8568+
// are multiple they are combined together with ANDs.
8569+
VPRegionBlock *TopRegion = Plan->getVectorLoopRegion();
8570+
auto *VecPreheader = cast<VPBasicBlock>(TopRegion->getSinglePredecessor());
8571+
VPBuilder Builder(VecPreheader);
8572+
for (auto C : RTChecks) {
8573+
HasAliasMask = true;
8574+
VPValue *Sink = Plan->getOrAddLiveIn(C.Sink);
8575+
VPValue *Src = Plan->getOrAddLiveIn(C.Src);
8576+
VPValue *M =
8577+
Builder.createNaryOp(VPInstruction::AliasLaneMask, {Sink, Src}, DL,
8578+
"active.lane.mask.alias");
8579+
if (AliasMask)
8580+
AliasMask = Builder.createAnd(AliasMask, M);
8581+
else
8582+
AliasMask = M;
8583+
}
8584+
}
8585+
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW, DL,
8586+
AliasMask);
85248587

85258588
VPRecipeBuilder RecipeBuilder(*Plan, OrigLoop, TLI, Legal, CM, PSE, Builder);
85268589

@@ -8737,7 +8800,7 @@ LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(VFRange &Range) {
87378800
bool WithoutRuntimeCheck =
87388801
Style == TailFoldingStyle::DataAndControlFlowWithoutRuntimeCheck;
87398802
VPlanTransforms::addActiveLaneMask(*Plan, ForControlFlow,
8740-
WithoutRuntimeCheck);
8803+
WithoutRuntimeCheck, AliasMask);
87418804
}
87428805
return Plan;
87438806
}
@@ -8777,7 +8840,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlan(VFRange &Range) {
87778840
// is guaranteed to not wrap.
87788841
bool HasNUW = true;
87798842
addCanonicalIVRecipes(*Plan, Legal->getWidestInductionType(), HasNUW,
8780-
DebugLoc());
8843+
DebugLoc(), nullptr);
87818844
assert(verifyVPlanIsValid(*Plan) && "VPlan is invalid");
87828845
return Plan;
87838846
}
@@ -9516,6 +9579,7 @@ static bool processLoopInVPlanNativePath(
95169579
// Mark the loop as already vectorized to avoid vectorizing again.
95179580
Hints.setAlreadyVectorized();
95189581
assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
9582+
95199583
return true;
95209584
}
95219585

@@ -9838,16 +9902,33 @@ bool LoopVectorizePass::processLoop(Loop *L) {
98389902
ElementCount UserVF = Hints.getWidth();
98399903
unsigned UserIC = Hints.getInterleave();
98409904

9905+
bool AddBranchWeights =
9906+
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9907+
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI, F->getDataLayout(),
9908+
AddBranchWeights);
9909+
9910+
// VPlan needs the aliasing pointers as Values and not SCEVs, so expand them
9911+
// here and put them into a list.
9912+
std::optional<ArrayRef<PointerDiffInfo>> DiffChecks =
9913+
LVL.getLAI()->getRuntimePointerChecking()->getDiffChecks();
9914+
SmallVector<PointerDiffInfoValues> DiffChecksValues;
9915+
if (DiffChecks.has_value() &&
9916+
useActiveLaneMask(CM.getTailFoldingStyle(true))) {
9917+
Instruction *Loc = L->getLoopPreheader()->getTerminator();
9918+
for (auto Check : *DiffChecks) {
9919+
Value *Sink = Checks.expandCodeForMemCheck(Check.SinkStart, Loc);
9920+
Value *Src = Checks.expandCodeForMemCheck(Check.SrcStart, Loc);
9921+
DiffChecksValues.push_back(PointerDiffInfoValues(Src, Sink));
9922+
}
9923+
}
9924+
98419925
// Plan how to best vectorize, return the best VF and its cost.
9842-
std::optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
9926+
std::optional<VectorizationFactor> MaybeVF =
9927+
LVP.plan(UserVF, UserIC, DiffChecksValues, Checks.HasAliasMask);
98439928

98449929
VectorizationFactor VF = VectorizationFactor::Disabled();
98459930
unsigned IC = 1;
98469931

9847-
bool AddBranchWeights =
9848-
hasBranchWeightMD(*L->getLoopLatch()->getTerminator());
9849-
GeneratedRTChecks Checks(*PSE.getSE(), DT, LI, TTI,
9850-
F->getDataLayout(), AddBranchWeights);
98519932
if (MaybeVF) {
98529933
VF = *MaybeVF;
98539934
// Select the interleave count.

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1239,6 +1239,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
12391239
SLPLoad,
12401240
SLPStore,
12411241
ActiveLaneMask,
1242+
AliasLaneMask,
12421243
ExplicitVectorLength,
12431244
/// Creates a scalar phi in a leaf VPBB with a single predecessor in VPlan.
12441245
/// The first operand is the incoming value from the predecessor in VPlan,
@@ -1258,6 +1259,7 @@ class VPInstruction : public VPRecipeWithIRFlags {
12581259
// scalar.
12591260
ExtractFromEnd,
12601261
LogicalAnd, // Non-poison propagating logical And.
1262+
PopCount,
12611263
// Add an offset in bytes (second operand) to a base pointer (first
12621264
// operand). Only generates scalar values (either for the first lane only or
12631265
// for all lanes, depending on its uses).

0 commit comments

Comments
 (0)