Commit cf7073b
[VPlan] Hoist predicated loads with complementary masks. (llvm#168373)
This patch adds a new VPlan transformation to hoist predicated loads if we can prove they execute unconditionally, i.e. there are 2 predicated loads from the same address with complementary masks. In that case we are guaranteed to execute one of them on each iteration, allowing us to remove the mask.

The transform groups masked replicating loads by their address SCEV, then checks if there are 2 loads with complementary masks. If that is the case, we check if there are any writes that may alias the load address in the blocks between the first and last load with the same address. The transform operates after linearizing the CFG, but before introducing replicate regions, which means this is just checking a chain of consecutive blocks. Currently this only uses noalias metadata to check for no-alias (using the helpers added in llvm#166247).

Then we create an unpredicated VPReplicateRecipe at the position of the first load and replace all users of the grouped loads with it.

Small Alive2 proof for hoisting with complementary masks: https://alive2.llvm.org/ce/z/kUx742

PR: llvm#168373

(cherry picked from commit 4cc8cc8)
1 parent 1ecd2de commit cf7073b
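
To make the pattern concrete, here is a hypothetical C loop of the kind this transform targets (illustrative only, not taken from the patch or its tests; all names are invented). After if-conversion, both arms contain a load of a[i], predicated with the complementary masks P = (b[i] > 0) and NOT P, so one of the two loads executes on every iteration and a single unmasked load suffices:

// Illustrative input. The division may keep the branches scalarized as
// predicated VPReplicateRecipes rather than widened masked loads, which
// is the case the transform handles.
void f(int *a, int *b, int *c, int n) {
  for (int i = 0; i < n; ++i) {
    if (b[i] > 0)
      c[i] = a[i] / b[i]; // load of a[i] under mask P
    else
      c[i] = a[i] - b[i]; // load of a[i] under mask NOT P
  }
}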

File tree

5 files changed: +269, -369 lines

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 0 deletions

@@ -8338,6 +8338,7 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
             std::unique_ptr<VPlan>(VPlan0->duplicate()), SubRange, &LVer)) {
       bool HasScalarVF = Plan->hasScalarVFOnly();
       // Now optimize the initial VPlan.
+      VPlanTransforms::hoistPredicatedLoads(*Plan, *PSE.getSE(), OrigLoop);
       if (!HasScalarVF)
         VPlanTransforms::runPass(VPlanTransforms::truncateToMinimalBitwidths,
                                  *Plan, CM.getMinimalBitwidths());

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 153 additions & 0 deletions

@@ -130,6 +130,41 @@ bool VPlanTransforms::tryToConvertVPInstructionsToVPRecipes(
   return true;
 }

+// Check if a load can be hoisted by verifying it doesn't alias with any stores
+// in blocks between FirstBB and LastBB using scoped noalias metadata.
+static bool canHoistLoadWithNoAliasCheck(VPReplicateRecipe *Load,
+                                         VPBasicBlock *FirstBB,
+                                         VPBasicBlock *LastBB) {
+  // Get the load's memory location and check if it aliases with any stores
+  // using scoped noalias metadata.
+  auto LoadLoc = vputils::getMemoryLocation(*Load);
+  if (!LoadLoc || !LoadLoc->AATags.Scope)
+    return false;
+
+  const AAMDNodes &LoadAA = LoadLoc->AATags;
+  for (VPBlockBase *Block = FirstBB; Block;
+       Block = Block->getSingleSuccessor()) {
+    // This function assumes a simple linear chain of blocks. If there are
+    // multiple successors, we would need more complex analysis.
+    assert(Block->getNumSuccessors() <= 1 &&
+           "Expected at most one successor in block chain");
+    auto *VPBB = cast<VPBasicBlock>(Block);
+    for (VPRecipeBase &R : *VPBB) {
+      if (R.mayWriteToMemory()) {
+        auto Loc = vputils::getMemoryLocation(R);
+        // Bail out if we can't get the location or if the scoped noalias
+        // metadata indicates potential aliasing.
+        if (!Loc || ScopedNoAliasAAResult::mayAliasInScopes(
+                        LoadAA.Scope, Loc->AATags.NoAlias))
+          return false;
+      }
+    }
+    if (Block == LastBB)
+      break;
+  }
+  return true;
+}
+
 static bool sinkScalarOperands(VPlan &Plan) {
   auto Iter = vp_depth_first_deep(Plan.getEntry());
   bool Changed = false;
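
For context (this note and sketch are mine, not part of the patch): the Scope and NoAlias tags consulted by canHoistLoadWithNoAliasCheck are scoped-noalias metadata of the kind LoopVersioning (the &LVer passed in the LoopVectorize.cpp hunk above) attaches to accesses in the versioned loop. A minimal sketch of how such metadata is attached via MDBuilder, with invented function and scope names:

#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"

// Tag a load/store pair with a fresh alias scope so scoped-noalias AA can
// prove they don't alias: the load advertises the scope via !alias.scope,
// and the store promises non-aliasing with that scope via !noalias.
static void tagNoAliasPair(llvm::Instruction &Load, llvm::Instruction &Store) {
  llvm::LLVMContext &Ctx = Load.getContext();
  llvm::MDBuilder MDB(Ctx);
  llvm::MDNode *Domain = MDB.createAnonymousAliasScopeDomain("LVerDomain");
  llvm::MDNode *Scope = MDB.createAnonymousAliasScope(Domain, "LoadScope");
  llvm::Metadata *Args[] = {Scope};
  llvm::MDNode *List = llvm::MDNode::get(Ctx, Args);
  Load.setMetadata(llvm::LLVMContext::MD_alias_scope, List);
  Store.setMetadata(llvm::LLVMContext::MD_noalias, List);
}

mayAliasInScopes then reports no-alias only when every scope in the load's !alias.scope list is covered by the write's !noalias list.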
@@ -3151,6 +3186,124 @@ void VPlanTransforms::hoistInvariantLoads(VPlan &Plan) {
     }
   }
 }
+
+// Returns the intersection of metadata from a group of loads.
+static VPIRMetadata getCommonLoadMetadata(ArrayRef<VPReplicateRecipe *> Loads) {
+  VPIRMetadata CommonMetadata = *Loads.front();
+  for (VPReplicateRecipe *Load : drop_begin(Loads))
+    CommonMetadata.intersect(*Load);
+  return CommonMetadata;
+}
+
+void VPlanTransforms::hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE,
+                                           const Loop *L) {
+  using namespace VPlanPatternMatch;
+  VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion();
+  VPTypeAnalysis TypeInfo(Plan);
+  VPDominatorTree VPDT(Plan);
+
+  // Group predicated loads by their address SCEV.
+  DenseMap<const SCEV *, SmallVector<VPReplicateRecipe *>> LoadsByAddress;
+  for (VPBlockBase *Block : vp_depth_first_shallow(LoopRegion->getEntry())) {
+    auto *VPBB = cast<VPBasicBlock>(Block);
+    for (VPRecipeBase &R : *VPBB) {
+      auto *RepR = dyn_cast<VPReplicateRecipe>(&R);
+      if (!RepR || RepR->getOpcode() != Instruction::Load ||
+          !RepR->isPredicated())
+        continue;
+
+      VPValue *Addr = RepR->getOperand(0);
+      const SCEV *AddrSCEV = vputils::getSCEVExprForVPValue(Addr, SE, L);
+      if (!isa<SCEVCouldNotCompute>(AddrSCEV))
+        LoadsByAddress[AddrSCEV].push_back(RepR);
+    }
+  }
+
+  // For each address, collect loads with complementary masks, sort by
+  // dominance, and use the earliest load.
+  for (auto &[Addr, Loads] : LoadsByAddress) {
+    if (Loads.size() < 2)
+      continue;
+
+    // Collect groups of loads with complementary masks.
+    SmallVector<SmallVector<VPReplicateRecipe *, 4>> LoadGroups;
+    for (VPReplicateRecipe *&LoadI : Loads) {
+      if (!LoadI)
+        continue;
+
+      VPValue *MaskI = LoadI->getMask();
+      Type *TypeI = TypeInfo.inferScalarType(LoadI);
+      SmallVector<VPReplicateRecipe *, 4> Group;
+      Group.push_back(LoadI);
+      LoadI = nullptr;
+
+      // Find all loads with the same type.
+      for (VPReplicateRecipe *&LoadJ : Loads) {
+        if (!LoadJ)
+          continue;
+
+        Type *TypeJ = TypeInfo.inferScalarType(LoadJ);
+        if (TypeI == TypeJ) {
+          Group.push_back(LoadJ);
+          LoadJ = nullptr;
+        }
+      }
+
+      // Check if any load in the group has a complementary mask with another,
+      // that is M1 == NOT(M2) or M2 == NOT(M1).
+      bool HasComplementaryMask =
+          any_of(drop_begin(Group), [MaskI](VPReplicateRecipe *Load) {
+            VPValue *MaskJ = Load->getMask();
+            return match(MaskI, m_Not(m_Specific(MaskJ))) ||
+                   match(MaskJ, m_Not(m_Specific(MaskI)));
+          });
+
+      if (HasComplementaryMask)
+        LoadGroups.push_back(std::move(Group));
+    }
+
+    // For each group, check memory dependencies and hoist the earliest load.
+    for (auto &Group : LoadGroups) {
+      // Sort loads by dominance order, with earliest (most dominating) first.
+      sort(Group, [&VPDT](VPReplicateRecipe *A, VPReplicateRecipe *B) {
+        return VPDT.properlyDominates(A, B);
+      });
+
+      VPReplicateRecipe *EarliestLoad = Group.front();
+      VPBasicBlock *FirstBB = EarliestLoad->getParent();
+      VPBasicBlock *LastBB = Group.back()->getParent();
+
+      // Check that the load doesn't alias with stores between first and last.
+      if (!canHoistLoadWithNoAliasCheck(EarliestLoad, FirstBB, LastBB))
+        continue;
+
+      // Find the load with minimum alignment to use.
+      auto *LoadWithMinAlign =
+          *min_element(Group, [](VPReplicateRecipe *A, VPReplicateRecipe *B) {
+            return cast<LoadInst>(A->getUnderlyingInstr())->getAlign() <
+                   cast<LoadInst>(B->getUnderlyingInstr())->getAlign();
+          });
+
+      // Collect common metadata from all loads in the group.
+      VPIRMetadata CommonMetadata = getCommonLoadMetadata(Group);
+
+      // Create an unpredicated load with minimum alignment using the earliest
+      // dominating address and common metadata.
+      auto *UnpredicatedLoad = new VPReplicateRecipe(
+          LoadWithMinAlign->getUnderlyingInstr(), EarliestLoad->getOperand(0),
+          /*IsSingleScalar=*/false, /*Mask=*/nullptr,
+          CommonMetadata);
+      UnpredicatedLoad->insertBefore(EarliestLoad);
+
+      // Replace all loads in the group with the unpredicated load.
+      for (VPReplicateRecipe *Load : Group) {
+        Load->replaceAllUsesWith(UnpredicatedLoad);
+        Load->eraseFromParent();
+      }
+    }
+  }
+}
+
 /// Returns true if \p V is VPWidenLoadRecipe or VPInterleaveRecipe that can be
 /// converted to a narrower recipe. \p V is used by a wide recipe that feeds a
 /// store interleave group at index \p Idx, \p WideMember0 is the recipe feeding
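
As a standalone illustration of the complementary-mask test above, here is a sketch using plain IR Values and llvm/IR/PatternMatch.h rather than VPValues (VPlanPatternMatch mirrors this API for recipes; the helper name is invented). A logical NOT of an i1 mask is canonicalized in IR to an xor with true, which m_Not recognizes:

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

// Two masks are complementary when one is the bitwise NOT of the other,
// checked in both directions: M1 == xor(M2, true) or M2 == xor(M1, true).
static bool areComplementaryMasks(llvm::Value *M1, llvm::Value *M2) {
  using namespace llvm::PatternMatch;
  return match(M1, m_Not(m_Specific(M2))) ||
         match(M2, m_Not(m_Specific(M1)));
}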

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 6 additions & 0 deletions

@@ -235,6 +235,12 @@ struct VPlanTransforms {
   /// plan using noalias metadata.
   static void hoistInvariantLoads(VPlan &Plan);

+  /// Hoist predicated loads from the same address to the loop entry block, if
+  /// they are guaranteed to execute on both paths (i.e., in replicate regions
+  /// with complementary masks P and NOT P).
+  static void hoistPredicatedLoads(VPlan &Plan, ScalarEvolution &SE,
+                                   const Loop *L);
+
   /// Try to convert a plan with interleave groups with VF elements to a plan
   /// with the interleave groups replaced by wide loads and stores processing VF
   /// elements, if all transformed interleave groups access the full vector
