Skip to content

Commit cf0df88

Browse files
committed
[LoopVectorize][LAA] Hoist load in memory IV to allow vectorization
Adds a VPScalarIVPromotionRecipe to promote a memory-based IV to a scalar IV. The SCEV step is multiplied by VFxUF (or EVL) to advance the value between the load and store. The kind of loop this patch enables vectorization for looks like this: while.body: %theFirst.addr.0112 = phi ptr [ %incdec.ptr9, %while.body ], [ %theFirst, %while.body.preheader ] %thePointer.0111 = phi ptr [ %incdec.ptr, %while.body ], [ %add.ptr.i, %while.body.preheader ] %1 = load i16, ptr %theFirst.addr.0112, align 2 store i16 %1, ptr %thePointer.0111, align 2 %incdec.ptr = getelementptr inbounds nuw i8, ptr %thePointer.0111, i64 2 %2 = load i64, ptr %m_size_ptr, align 8 %inc = add i64 %2, 1 store i64 %inc, ptr %m_size_ptr, align 8 %incdec.ptr9 = getelementptr inbounds nuw i8, ptr %theFirst.addr.0112, i64 2 %cmp7.not = icmp eq ptr %incdec.ptr9, %theLast br i1 %cmp7.not, label %cleanup.loopexit, label %while.body As you can see, %m_size_ptr is a loop-invariant pointer, so the load/store through it can be promoted to a scalar IV, which then allows the loop to be vectorized.
1 parent e86d562 commit cf0df88

37 files changed

+463
-213
lines changed

llvm/lib/Analysis/LoopAccessAnalysis.cpp

Lines changed: 22 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2329,6 +2329,19 @@ MemoryDepChecker::getDependenceDistanceStrideAndSize(
23292329
int64_t StrideBPtrInt = *StrideBPtr;
23302330
LLVM_DEBUG(dbgs() << "LAA: Src induction step: " << StrideAPtrInt
23312331
<< " Sink induction step: " << StrideBPtrInt << "\n");
2332+
2333+
if (!StrideAPtrInt && !StrideBPtrInt && !(AIsWrite && BIsWrite) &&
2334+
(AIsWrite || BIsWrite) && !isa<UndefValue>(APtr) &&
2335+
InnermostLoop->isLoopInvariant(APtr) &&
2336+
InnermostLoop->isLoopInvariant(BPtr)) {
2337+
LoadInst *L = dyn_cast<LoadInst>(AIsWrite ? BInst : AInst);
2338+
if (InnermostLoop->isLoopInvariant(L->getPointerOperand()))
2339+
if (L && isInvariantLoadHoistable(L, SE, nullptr, nullptr, nullptr))
2340+
ShouldRetryWithRuntimeChecks = true;
2341+
2342+
return MemoryDepChecker::Dependence::Unknown;
2343+
}
2344+
23322345
// At least Src or Sink are loop invariant and the other is strided or
23332346
// invariant. We can generate a runtime check to disambiguate the accesses.
23342347
if (!StrideAPtrInt || !StrideBPtrInt)
@@ -2942,9 +2955,15 @@ bool LoopAccessInfo::analyzeLoop(AAResults *AA, const LoopInfo *LI,
29422955
// See if there is an unsafe dependency between a load to a uniform address and
29432956
// store to the same uniform address.
29442957
if (UniformStores.contains(Ptr)) {
2945-
LLVM_DEBUG(dbgs() << "LAA: Found an unsafe dependency between a uniform "
2946-
"load and uniform store to the same address!\n");
2947-
HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
2958+
auto &SE = *PSE->getSE();
2959+
if (TheLoop->isLoopInvariant(LD->getPointerOperand()) &&
2960+
!getDepChecker().isInvariantLoadHoistable(LD, SE, nullptr, nullptr,
2961+
nullptr)) {
2962+
LLVM_DEBUG(
2963+
dbgs() << "LAA: Found an unsafe dependency between a uniform "
2964+
"load and uniform store to the same address!\n");
2965+
HasLoadStoreDependenceInvolvingLoopInvariantAddress = true;
2966+
}
29482967
}
29492968

29502969
MemoryLocation Loc = MemoryLocation::get(LD);

llvm/lib/Transforms/Scalar/LoopDistribute.cpp

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -680,12 +680,14 @@ class LoopDistributeForLoop {
680680

681681
// Currently, we only distribute to isolate the part of the loop with
682682
// dependence cycles to enable partial vectorization.
683-
if (LAI->canVectorizeMemory())
683+
if (!LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress() &&
684+
LAI->canVectorizeMemory())
684685
return fail("MemOpsCanBeVectorized",
685686
"memory operations are safe for vectorization");
686687

687688
auto *Dependences = LAI->getDepChecker().getDependences();
688-
if (!Dependences || Dependences->empty())
689+
if (!LAI->hasLoadStoreDependenceInvolvingLoopInvariantAddress() &&
690+
(!Dependences || Dependences->empty()))
689691
return fail("NoUnsafeDeps", "no unsafe dependences to isolate");
690692

691693
LLVM_DEBUG(dbgs() << "LDist: Found a candidate loop: "

llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -630,6 +630,8 @@ class LoopVectorizationPlanner {
630630
VPRecipeBuilder &RecipeBuilder,
631631
ElementCount MinVF);
632632

633+
void adjustScalarIVPromotions(VPlanPtr &Plan);
634+
633635
/// Attach the runtime checks of \p RTChecks to \p Plan.
634636
void attachRuntimeChecks(VPlan &Plan, GeneratedRTChecks &RTChecks,
635637
bool HasBranchWeights) const;

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 33 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4092,6 +4092,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
40924092
case VPDef::VPEVLBasedIVPHISC:
40934093
case VPDef::VPPredInstPHISC:
40944094
case VPDef::VPBranchOnMaskSC:
4095+
case VPDef::VPScalarIVPromotionRecipeSC:
40954096
continue;
40964097
case VPDef::VPReductionSC:
40974098
case VPDef::VPActiveLaneMaskPHISC:
@@ -7523,6 +7524,14 @@ BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
75237524
OriginalScalarPH->setName("vec.epilog.iter.check");
75247525
VPIRBasicBlock *NewEntry = Plan.createVPIRBasicBlock(OriginalScalarPH);
75257526
VPBasicBlock *OldEntry = Plan.getEntry();
7527+
7528+
for (VPRecipeBase &R : make_early_inc_range(*OldEntry))
7529+
// Move hoisted loads to split PreHeader
7530+
if (auto RepR = dyn_cast<VPReplicateRecipe>(&R)) {
7531+
RepR->removeFromParent();
7532+
VectorPHVPBB->appendRecipe(RepR);
7533+
}
7534+
75267535
for (auto &R : make_early_inc_range(*OldEntry)) {
75277536
// Skip moving VPIRInstructions (including VPIRPhis), which are unmovable by
75287537
// defining.
@@ -7532,6 +7541,7 @@ BasicBlock *EpilogueVectorizerEpilogueLoop::createVectorizedLoopSkeleton() {
75327541
}
75337542

75347543
VPBlockUtils::reassociateBlocks(OldEntry, NewEntry);
7544+
75357545
Plan.setEntry(NewEntry);
75367546
// OldEntry is now dead and will be cleaned up when the plan gets destroyed.
75377547

@@ -8324,6 +8334,23 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(ElementCount MinVF,
83248334
}
83258335
}
83268336

8337+
void LoopVectorizationPlanner::adjustScalarIVPromotions(VPlanPtr &Plan) {
8338+
VPScalarIVPromotionRecipe *Recipe = nullptr;
8339+
8340+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
8341+
vp_depth_first_deep(Plan->getVectorLoopRegion())))
8342+
for (VPRecipeBase &R : *VPBB)
8343+
if (auto *ScalarIV = dyn_cast<VPScalarIVPromotionRecipe>(&R)) {
8344+
assert(!Recipe && "Only one FFLoad is supported");
8345+
Recipe = ScalarIV;
8346+
}
8347+
8348+
if (!Recipe)
8349+
return;
8350+
8351+
Recipe->setVFxUF(&Plan->getVFxUF());
8352+
}
8353+
83278354
VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
83288355
VPlanPtr Plan, VFRange &Range, LoopVersioning *LVer) {
83298356

@@ -8434,11 +8461,12 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
84348461
// latter are added above for masking.
84358462
// FIXME: Migrate code relying on the underlying instruction from VPlan0
84368463
// to construct recipes below to not use the underlying instruction.
8437-
if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe, VPBlendRecipe>(
8438-
&R) ||
8464+
if (isa<VPCanonicalIVPHIRecipe, VPWidenCanonicalIVRecipe, VPBlendRecipe,
8465+
VPScalarIVPromotionRecipe>(&R) ||
84398466
(isa<VPInstruction>(&R) && !UnderlyingValue))
84408467
continue;
8441-
assert(isa<VPInstruction>(&R) && UnderlyingValue && "unsupported recipe");
8468+
assert((isa<VPInstruction, VPReplicateRecipe>(&R) && UnderlyingValue &&
8469+
"unsupported recipe"));
84428470

84438471
// TODO: Gradually replace uses of underlying instruction by analyses on
84448472
// VPlan.
@@ -8514,6 +8542,8 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
85148542
// Adjust the recipes for any inloop reductions.
85158543
adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start);
85168544

8545+
adjustScalarIVPromotions(Plan);
8546+
85178547
// Apply mandatory transformation to handle FP maxnum/minnum reduction with
85188548
// NaNs if possible, bail out otherwise.
85198549
if (!VPlanTransforms::runPass(VPlanTransforms::handleMaxMinNumReductions,

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 55 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -554,6 +554,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
554554
case VPRecipeBase::VPWidenPointerInductionSC:
555555
case VPRecipeBase::VPReductionPHISC:
556556
case VPRecipeBase::VPPartialReductionSC:
557+
case VPRecipeBase::VPScalarIVPromotionRecipeSC:
557558
return true;
558559
case VPRecipeBase::VPBranchOnMaskSC:
559560
case VPRecipeBase::VPInterleaveEVLSC:
@@ -580,10 +581,12 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
580581

581582
/// Returns the underlying instruction.
582583
Instruction *getUnderlyingInstr() {
583-
return cast<Instruction>(getUnderlyingValue());
584+
return getUnderlyingValue() ? dyn_cast<Instruction>(getUnderlyingValue())
585+
: nullptr;
584586
}
585587
const Instruction *getUnderlyingInstr() const {
586-
return cast<Instruction>(getUnderlyingValue());
588+
return getUnderlyingValue() ? dyn_cast<Instruction>(getUnderlyingValue())
589+
: nullptr;
587590
}
588591

589592
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -2312,7 +2315,8 @@ struct VPFirstOrderRecurrencePHIRecipe : public VPHeaderPHIRecipe {
23122315

23132316
VPFirstOrderRecurrencePHIRecipe *clone() override {
23142317
return new VPFirstOrderRecurrencePHIRecipe(
2315-
cast<PHINode>(getUnderlyingInstr()), *getOperand(0));
2318+
getUnderlyingInstr() ? cast<PHINode>(getUnderlyingInstr()) : nullptr,
2319+
*getOperand(0));
23162320
}
23172321

23182322
void execute(VPTransformState &State) override;
@@ -3475,6 +3479,54 @@ class VPExpandSCEVRecipe : public VPSingleDefRecipe {
34753479
const SCEV *getSCEV() const { return Expr; }
34763480
};
34773481

3482+
struct LLVM_ABI_FOR_TEST VPScalarIVPromotionRecipe : public VPSingleDefRecipe {
3483+
VPScalarIVPromotionRecipe(std::initializer_list<VPValue *> Operands,
3484+
DebugLoc DL = DebugLoc::getUnknown())
3485+
: VPSingleDefRecipe(VPDef::VPScalarIVPromotionRecipeSC, Operands, DL) {}
3486+
3487+
VP_CLASSOF_IMPL(VPDef::VPScalarIVPromotionRecipeSC)
3488+
3489+
bool isSingleScalar() const { return true; }
3490+
3491+
VPScalarIVPromotionRecipe *clone() override {
3492+
assert(getNumOperands() == 3 || getNumOperands() == 4);
3493+
if (getNumOperands() == 3)
3494+
return new VPScalarIVPromotionRecipe(
3495+
{getOperand(0), getOperand(1), getOperand(2)}, getDebugLoc());
3496+
return new VPScalarIVPromotionRecipe(
3497+
{getOperand(0), getOperand(1), getOperand(2), getOperand(3)},
3498+
getDebugLoc());
3499+
}
3500+
3501+
VPValue *getVFxUF() { return getOperand(3); }
3502+
void setVFxUF(VPValue *V) {
3503+
if (getNumOperands() == 3) {
3504+
addOperand(V);
3505+
} else {
3506+
setOperand(3, V);
3507+
}
3508+
}
3509+
3510+
void execute(VPTransformState &State) override;
3511+
3512+
InstructionCost computeCost(ElementCount VF,
3513+
VPCostContext &Ctx) const override {
3514+
return 0;
3515+
}
3516+
3517+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
3518+
/// Print the recipe.
3519+
void print(raw_ostream &O, const Twine &Indent,
3520+
VPSlotTracker &SlotTracker) const override;
3521+
#endif
3522+
3523+
bool usesScalars(const VPValue *Op) const override {
3524+
assert(is_contained(operands(), Op) &&
3525+
"Op must be an operand of the recipe");
3526+
return true;
3527+
}
3528+
};
3529+
34783530
/// Canonical scalar induction phi of the vector loop. Starting at the specified
34793531
/// start value (either 0 or the resume value when vectorizing the epilogue
34803532
/// loop). VPWidenCanonicalIVRecipe represents the vector version of the

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,7 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
136136
return inferScalarType(R->getOperand(0));
137137
case VPInstruction::BranchOnCond:
138138
case VPInstruction::BranchOnCount:
139+
case Instruction::Store:
139140
return Type::getVoidTy(Ctx);
140141
default:
141142
break;
@@ -289,9 +290,10 @@ Type *VPTypeAnalysis::inferScalarType(const VPValue *V) {
289290
.Case<VPReductionRecipe, VPPredInstPHIRecipe, VPWidenPHIRecipe,
290291
VPScalarIVStepsRecipe, VPWidenGEPRecipe, VPVectorPointerRecipe,
291292
VPVectorEndPointerRecipe, VPWidenCanonicalIVRecipe,
292-
VPPartialReductionRecipe>([this](const VPRecipeBase *R) {
293-
return inferScalarType(R->getOperand(0));
294-
})
293+
VPPartialReductionRecipe, VPScalarIVPromotionRecipe>(
294+
[this](const VPRecipeBase *R) {
295+
return inferScalarType(R->getOperand(0));
296+
})
295297
// VPInstructionWithType must be handled before VPInstruction.
296298
.Case<VPInstructionWithType, VPWidenIntrinsicRecipe,
297299
VPWidenCastRecipe>(

0 commit comments

Comments
 (0)