Skip to content

Commit 371302a

Browse files
committed
[VPlan] Hoist predicated loads with replicate recipes
This commit implements hoisting of predicated loads that are executed on both paths with complementary predicates (P and NOT P). When such loads access the same address, they can be hoisted to the loop entry as a single unpredicated load, eliminating branching overhead. Key features:
- Uses SCEV to group loads by address, handling different GEP instructions that compute the same address
- Checks for complementary masks (P and NOT P)
- Clones address computations when needed to maintain SSA form
- Hoists as an unpredicated VPReplicateRecipe (no widening yet)
1 parent e148d2d commit 371302a

File tree

5 files changed

+263
-369
lines changed

5 files changed

+263
-369
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8317,6 +8317,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
83178317
if (auto Plan = tryToBuildVPlanWithVPRecipes(
83188318
std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
83198319
// Now optimize the initial VPlan.
8320+
VPlanTransforms::hoistPredicatedLoads(*Plan, *PSE.getSE(), OrigLoop);
83208321
VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths,
83218322
*Plan, CM.getMinimalBitwidths());
83228323
VPlanTransforms::runPass(VPlanTransforms::optimize, *Plan);

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 147 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@
4242
#include "llvm/Support/TypeSize.h"
4343
#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
4444

45+
#define DEBUG_TYPE "loop-vectorize"
46+
4547
using namespace llvm;
4648
using namespace VPlanPatternMatch;
4749

@@ -3974,6 +3976,151 @@ void VPlanTransforms::hoistInvariantLoads(VPlan &Plan) {
39743976
}
39753977
}
39763978

3979+
// Returns the intersection of metadata from a group of loads.
3980+
static VPIRMetadata getCommonLoadMetadata(ArrayRef<VPReplicateRecipe *> Loads) {
3981+
VPIRMetadata CommonMetadata = *Loads.front();
3982+
for (VPReplicateRecipe *Load : drop_begin(Loads))
3983+
CommonMetadata.intersect(*Load);
3984+
return CommonMetadata;
3985+
}
3986+
3987+
// Check if a load can be hoisted by verifying it doesn't alias with any stores
// in blocks between FirstBB and LastBB using scoped noalias metadata.
//
// Walks the (assumed linear) chain of blocks starting at FirstBB and bails
// out on the first memory-writing recipe whose location is unknown or whose
// scoped-noalias metadata does not rule out aliasing with the load.
// NOTE(review): assumes FirstBB..LastBB form a single-successor chain (the
// assert below enforces this); also assumes the chain always reaches LastBB
// before running out of successors — confirm against the caller, which takes
// both blocks from the same dominance-sorted group.
static bool canHoistLoadWithNoAliasCheck(VPReplicateRecipe *Load,
                                         VPBasicBlock *FirstBB,
                                         VPBasicBlock *LastBB) {
  // Get the load's memory location and check if it aliases with any stores
  // using scoped noalias metadata. Without scope metadata there is nothing to
  // reason with, so conservatively refuse to hoist.
  auto LoadLoc = vputils::getMemoryLocation(*Load);
  if (!LoadLoc || !LoadLoc->AATags.Scope)
    return false;

  const AAMDNodes &LoadAA = LoadLoc->AATags;
  for (VPBlockBase *Block = FirstBB; Block;
       Block = Block->getSingleSuccessor()) {
    // This function assumes a simple linear chain of blocks. If there are
    // multiple successors, we would need more complex analysis.
    assert(Block->getNumSuccessors() <= 1 &&
           "Expected at most one successor in block chain");
    auto *VPBB = cast<VPBasicBlock>(Block);
    for (VPRecipeBase &R : *VPBB) {
      if (R.mayWriteToMemory()) {
        auto Loc = vputils::getMemoryLocation(R);
        // Bail out if we can't get the location or if the scoped noalias
        // metadata indicates potential aliasing.
        if (!Loc || ScopedNoAliasAAResult::mayAliasInScopes(
                        LoadAA.Scope, Loc->AATags.NoAlias))
          return false;
      }
    }

    // Stop once the last block of the group has been scanned.
    if (Block == LastBB)
      break;
  }
  return true;
}
4022+
4023+
/// Hoist predicated loads from the same address that execute under
/// complementary masks (P and NOT P): since exactly one of the two paths is
/// always taken, the address is unconditionally loaded from, and the group
/// can be replaced by a single unpredicated load placed at the earliest
/// (most dominating) member.
void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE,
                                           const Loop *L) {
  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
  VPTypeAnalysis TypeInfo(Plan);
  VPDominatorTree VPDT(Plan);

  // Group predicated loads by their address SCEV. Using SCEV (rather than the
  // VPValue operand) lets distinct GEP recipes computing the same address
  // land in the same bucket.
  MapVector<const SCEV *, SmallVector<VPReplicateRecipe *>> LoadsByAddress;
  for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
    auto *VPBB = cast<VPBasicBlock>(Block);
    for (VPRecipeBase &R : *VPBB) {
      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
      if (!RepR || RepR->getOpcode() != Instruction::Load ||
          !RepR->isPredicated())
        continue;

      VPValue *Addr = RepR->getOperand(0);
      const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, SE, L);
      if (!isa<SCEVCouldNotCompute>(AddrSCEV))
        LoadsByAddress[AddrSCEV].push_back(RepR);
    }
  }

  // For each address, collect loads with complementary masks, sort by
  // dominance, and use the earliest load.
  for (auto &[Addr, Loads] : LoadsByAddress) {
    if (Loads.size() < 2)
      continue;

    // Collect groups of loads with complementary masks. Entries are nulled
    // out in place as they are consumed, so each load joins at most one
    // group.
    SmallVector<SmallVector<VPReplicateRecipe *, 4>> LoadGroups;
    for (VPReplicateRecipe *&LoadI : Loads) {
      if (!LoadI)
        continue;

      VPValue *MaskI = LoadI->getMask();
      Type *TypeI = TypeInfo.inferScalarType(LoadI);
      SmallVector<VPReplicateRecipe *, 4> Group;
      Group.push_back(LoadI);
      LoadI = nullptr;

      // Find all loads with the same type.
      for (VPReplicateRecipe *&LoadJ : Loads) {
        if (!LoadJ)
          continue;

        Type *TypeJ = TypeInfo.inferScalarType(LoadJ);
        if (TypeI == TypeJ) {
          Group.push_back(LoadJ);
          LoadJ = nullptr;
        }
      }

      // Check if any load in the group has a complementary mask with another,
      // that is M1 == NOT(M2) or M2 == NOT(M1). Only such a pair guarantees
      // the address is accessed on every iteration.
      bool HasComplementaryMask =
          any_of(drop_begin(Group), [MaskI](VPReplicateRecipe *Load) {
            VPValue *MaskJ = Load->getMask();
            return match(MaskI, m_Not(m_Specific(MaskJ))) ||
                   match(MaskJ, m_Not(m_Specific(MaskI)));
          });

      if (HasComplementaryMask)
        LoadGroups.push_back(std::move(Group));
    }

    // For each group, check memory dependencies and hoist the earliest load.
    for (auto &Group : LoadGroups) {
      // Sort loads by dominance order, with earliest (most dominating) first.
      sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
        return VPDT.properlyDominates(A, B);
      });

      VPReplicateRecipe *EarliestLoad = Group.front();
      VPBasicBlock *FirstBB = EarliestLoad->getParent();
      VPBasicBlock *LastBB = Group.back()->getParent();

      // Check that the load doesn't alias with stores between first and last.
      if (!canHoistLoadWithNoAliasCheck(EarliestLoad, FirstBB, LastBB))
        continue;

      // Collect common metadata from all loads in the group; metadata valid
      // for only some members must not survive on the merged load.
      VPIRMetadata CommonMetadata = getCommonLoadMetadata(Group);

      // Create an unpredicated version of the earliest load with common
      // metadata.
      auto *UnpredicatedLoad = new VPReplicateRecipe(
          EarliestLoad->getUnderlyingInstr(), {EarliestLoad->getOperand(0)},
          /*IsSingleScalar=*/false, /*Mask=*/nullptr, *EarliestLoad, CommonMetadata);

      UnpredicatedLoad->insertBefore(EarliestLoad);

      // Replace all loads in the group with the unpredicated load.
      for (VPReplicateRecipe *Load : Group) {
        Load->replaceAllUsesWith(UnpredicatedLoad);
        Load->eraseFromParent();
      }
    }
  }
}
4123+
39774124
void VPlanTransforms::materializeConstantVectorTripCount(
39784125
VPlan &Plan, ElementCount BestVF, unsigned BestUF,
39794126
PredicatedScalarEvolution &PSE) {

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -314,6 +314,12 @@ struct VPlanTransforms {
314314
/// plan using noalias metadata.
315315
static void hoistInvariantLoads(VPlan &Plan);
316316

317+
/// Hoist predicated loads from the same address to the loop entry block, if
318+
/// they are guaranteed to execute on both paths (i.e., in replicate regions
319+
/// with complementary masks P and NOT P).
320+
static void hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE,
321+
const Loop *L);
322+
317323
// Materialize vector trip counts for constants early if it can simply be
318324
// computed as (Original TC / VF * UF) * VF * UF.
319325
static void

0 commit comments

Comments
 (0)