
Commit 65c7d0b

PR147297
1 parent 3c0f7b1 commit 65c7d0b

15 files changed, +668 −252 lines changed
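
What the hunks below add (a reading of this excerpt, not an authoritative PR summary): a new VPWidenStridedLoadRecipe plus a convertToStridedAccesses VPlan transform, so that widened loads whose addresses advance by a fixed stride can be emitted as llvm.experimental.vp.strided.load calls and costed via TTI::getStridedMemoryOpCost, instead of being left as gathers. A minimal, hypothetical example of the access pattern this targets (C++, not taken from the patch or its tests):

// Hypothetical input loop: the load of A[3 * I] has a constant, non-unit
// stride (12 bytes for i32). Such a load would otherwise be widened as a
// gather and could instead become a strided load under this change.
void sum_every_third(const int *A, int *Out, int N) {
  int S = 0;
  for (int I = 0; I < N; ++I)
    S += A[3 * I];
  *Out = S;
}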

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 23 additions & 9 deletions
@@ -3956,7 +3956,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
             [](const auto *R) { return Instruction::Select; })
         .Case<VPWidenStoreRecipe>(
             [](const auto *R) { return Instruction::Store; })
-        .Case<VPWidenLoadRecipe>(
+        .Case<VPWidenLoadRecipe, VPWidenStridedLoadRecipe>(
            [](const auto *R) { return Instruction::Load; })
         .Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
             [](const auto *R) { return Instruction::Call; })

@@ -4056,6 +4056,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
     case VPDef::VPReductionPHISC:
     case VPDef::VPInterleaveEVLSC:
     case VPDef::VPInterleaveSC:
+    case VPDef::VPWidenStridedLoadSC:
     case VPDef::VPWidenLoadEVLSC:
     case VPDef::VPWidenLoadSC:
     case VPDef::VPWidenStoreEVLSC:

@@ -6940,6 +6941,12 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
                                   RepR->getUnderlyingInstr(), VF))
         return true;
     }
+
+    // The strided load is transformed from a gather through VPlanTransform,
+    // and its cost will be lower than the original gather.
+    if (isa<VPWidenStridedLoadRecipe>(&R))
+      return true;
+
     if (Instruction *UI = GetInstructionForCost(&R)) {
       // If we adjusted the predicate of the recipe, the cost in the legacy
       // cost model may be different.

@@ -7495,7 +7502,10 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
         new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I),
                                      /*Stride*/ -1, Flags, I->getDebugLoc());
   } else {
-    VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
+    const DataLayout &DL = I->getDataLayout();
+    auto *StrideTy = DL.getIndexType(Ptr->getUnderlyingValue()->getType());
+    VPValue *StrideOne = Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, 1));
+    VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), StrideOne,
                                           GEP ? GEP->getNoWrapFlags()
                                               : GEPNoWrapFlags::none(),
                                           I->getDebugLoc());

@@ -8592,19 +8602,14 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
           *Plan))
     return nullptr;
 
+  VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
   // Transform recipes to abstract recipes if it is legal and beneficial and
   // clamp the range for better cost estimation.
   // TODO: Enable following transform when the EVL-version of extended-reduction
   // and mulacc-reduction are implemented.
-  if (!CM.foldTailWithEVL()) {
-    VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind);
+  if (!CM.foldTailWithEVL())
     VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
                              CostCtx, Range);
-  }
-
-  for (ElementCount VF : Range)
-    Plan->addVF(VF);
-  Plan->setName("Initial VPlan");
 
   // Interleave memory: for each Interleave Group we marked earlier as relevant
   // for this VPlan, replace the Recipes widening its memory instructions with a

@@ -8617,6 +8622,15 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
   VPlanTransforms::runPass(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE,
                            Legal->getLAI()->getSymbolicStrides());
 
+  // Convert memory recipes to strided access recipes if the strided access is
+  // legal and profitable.
+  VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan,
+                           CostCtx, Range);
+
+  for (ElementCount VF : Range)
+    Plan->addVF(VF);
+  Plan->setName("Initial VPlan");
+
   auto BlockNeedsPredication = [this](BasicBlock *BB) {
     return Legal->blockNeedsPredication(BB);
   };

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 82 additions & 11 deletions
@@ -563,6 +563,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
     case VPRecipeBase::VPInterleaveEVLSC:
     case VPRecipeBase::VPInterleaveSC:
     case VPRecipeBase::VPIRInstructionSC:
+    case VPRecipeBase::VPWidenStridedLoadSC:
     case VPRecipeBase::VPWidenLoadEVLSC:
     case VPRecipeBase::VPWidenLoadSC:
     case VPRecipeBase::VPWidenStoreEVLSC:

@@ -1769,10 +1770,6 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
 class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
   Type *SourceElementTy;
 
-  bool isPointerLoopInvariant() const {
-    return getOperand(0)->isDefinedOutsideLoopRegions();
-  }
-
   bool isIndexLoopInvariant(unsigned I) const {
     return getOperand(I + 1)->isDefinedOutsideLoopRegions();
   }

@@ -1805,6 +1802,29 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
   /// This recipe generates a GEP instruction.
   unsigned getOpcode() const { return Instruction::GetElementPtr; }
 
+  bool isPointerLoopInvariant() const {
+    return getOperand(0)->isDefinedOutsideLoopRegions();
+  }
+
+  std::optional<unsigned> getUniqueVariantIndex() const {
+    std::optional<unsigned> VarIdx;
+    for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
+      if (isIndexLoopInvariant(I))
+        continue;
+
+      if (VarIdx)
+        return std::nullopt;
+      VarIdx = I;
+    }
+    return VarIdx;
+  }
+
+  Type *getIndexedType(unsigned I) const {
+    auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
+    SmallVector<Value *, 4> Ops(GEP->idx_begin(), GEP->idx_begin() + I);
+    return GetElementPtrInst::getIndexedType(SourceElementTy, Ops);
+  }
+
   /// Generate the gep nodes.
   void execute(VPTransformState &State) override;
 
@@ -1895,20 +1915,23 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
 #endif
 };
 
-/// A recipe to compute the pointers for widened memory accesses of IndexTy.
+/// A recipe to compute the pointers for widened memory accesses of IndexedTy,
+/// with the Stride expressed in units of IndexedTy.
 class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
-                              public VPUnrollPartAccessor<1> {
+                              public VPUnrollPartAccessor<2> {
   Type *SourceElementTy;
 
 public:
-  VPVectorPointerRecipe(VPValue *Ptr, Type *SourceElementTy,
+  VPVectorPointerRecipe(VPValue *Ptr, Type *SourceElementTy, VPValue *Stride,
                         GEPNoWrapFlags GEPFlags, DebugLoc DL)
-      : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
-                            GEPFlags, DL),
+      : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC,
+                            ArrayRef<VPValue *>({Ptr, Stride}), GEPFlags, DL),
         SourceElementTy(SourceElementTy) {}
 
   VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)
 
+  VPValue *getStride() const { return getOperand(1); }
+
   void execute(VPTransformState &State) override;
 
   Type *getSourceElementType() const { return SourceElementTy; }

@@ -1929,7 +1952,8 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
 
   VPVectorPointerRecipe *clone() override {
     return new VPVectorPointerRecipe(getOperand(0), SourceElementTy,
-                                     getGEPNoWrapFlags(), getDebugLoc());
+                                     getStride(), getGEPNoWrapFlags(),
+                                     getDebugLoc());
   }
 
   /// Return true if this VPVectorPointerRecipe corresponds to part 0. Note that

@@ -3186,7 +3210,8 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
     return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
            R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
-           R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
+           R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC ||
+           R->getVPDefID() == VPRecipeBase::VPWidenStridedLoadSC;
   }
 
   static inline bool classof(const VPUser *U) {

@@ -3307,6 +3332,52 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
   }
 };
 
+/// A recipe for strided load operations, using the base address, stride, and an
+/// optional mask. This recipe will generate an vp.strided.load intrinsic call
+/// to represent memory accesses with a fixed stride.
+struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe,
+                                        public VPValue {
+  VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride,
+                           VPValue *VF, VPValue *Mask,
+                           const VPIRMetadata &Metadata, DebugLoc DL)
+      : VPWidenMemoryRecipe(
+            VPDef::VPWidenStridedLoadSC, Load, {Addr, Stride, VF},
+            /*Consecutive=*/false, /*Reverse=*/false, Metadata, DL),
+        VPValue(this, &Load) {
+    setMask(Mask);
+  }
+
+  VPWidenStridedLoadRecipe *clone() override {
+    return new VPWidenStridedLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
+                                        getStride(), getVF(), getMask(), *this,
+                                        getDebugLoc());
+  }
+
+  VP_CLASSOF_IMPL(VPDef::VPWidenStridedLoadSC);
+
+  /// Return the stride operand.
+  VPValue *getStride() const { return getOperand(1); }
+
+  /// Return the VF operand.
+  VPValue *getVF() const { return getOperand(2); }
+
+  /// Generate a strided load.
+  void execute(VPTransformState &State) override;
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+  /// Print the recipe.
+  void print(raw_ostream &O, const Twine &Indent,
+             VPSlotTracker &SlotTracker) const override;
+#endif
+
+  /// Returns true if the recipe only uses the first lane of operand \p Op.
+  bool onlyFirstLaneUsed(const VPValue *Op) const override {
+    assert(is_contained(operands(), Op) &&
+           "Op must be an operand of the recipe");
+    return Op == getAddr() || Op == getStride() || Op == getVF();
+  }
+};
+
 /// A recipe for widening store operations, using the stored value, the address
 /// to store to and an optional mask.
 struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
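
To illustrate how the new recipe is meant to be used, here is a hedged sketch of how a VPlan transform (for example the convertToStridedAccesses pass invoked from LoopVectorize.cpp above, whose implementation is not part of this excerpt) might swap a widened gather for the strided-load recipe declared here. NewAddr and StrideInBytes are hypothetical VPValues the transform would have computed; only the constructor signature comes from the declaration above:

// Hedged sketch, not code from this commit. LoadR is an existing widened
// (gather) load recipe; NewAddr/StrideInBytes are placeholder VPValues.
auto *LoadR = cast<VPWidenLoadRecipe>(&R);
auto *StridedLoad = new VPWidenStridedLoadRecipe(
    *cast<LoadInst>(&LoadR->getIngredient()), NewAddr, StrideInBytes,
    &Plan.getVF(), LoadR->getMask(), /*Metadata=*/*LoadR, LoadR->getDebugLoc());
StridedLoad->insertBefore(LoadR);
LoadR->replaceAllUsesWith(StridedLoad);
LoadR->eraseFromParent();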

llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp

Lines changed: 4 additions & 2 deletions
@@ -188,8 +188,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
 }
 
 Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
-  assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
-         "Store recipes should not define any values");
+  assert(
+      (isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
+          R)) &&
+      "Store recipes should not define any values");
   return cast<LoadInst>(&R->getIngredient())->getType();
 }
 

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 65 additions & 8 deletions
@@ -82,6 +82,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
   case VPWidenCastSC:
   case VPWidenGEPSC:
   case VPWidenIntOrFpInductionSC:
+  case VPWidenStridedLoadSC:
   case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
   case VPWidenPHISC:

@@ -105,6 +106,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
     return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
   case VPInstructionSC:
     return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
+  case VPWidenStridedLoadSC:
   case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
     return true;

@@ -188,6 +190,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
   case VPInterleaveEVLSC:
   case VPInterleaveSC:
     return mayWriteToMemory();
+  case VPWidenStridedLoadSC:
   case VPWidenLoadEVLSC:
   case VPWidenLoadSC:
   case VPWidenStoreEVLSC:

@@ -2581,13 +2584,21 @@ void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent,
 void VPVectorPointerRecipe::execute(VPTransformState &State) {
   auto &Builder = State.Builder;
   unsigned CurrentPart = getUnrollPart(*this);
-  Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
-                                /*IsUnitStride*/ true, CurrentPart, Builder);
+  Value *Stride = State.get(getStride(), /*IsScalar*/ true);
+
+  auto *StrideC = dyn_cast<ConstantInt>(Stride);
+  bool IsStrideOne = StrideC && StrideC->isOne();
+  bool IsUnitStride = IsStrideOne || (StrideC && StrideC->isMinusOne());
+  Type *IndexTy =
+      getGEPIndexTy(State.VF.isScalable(),
+                    /*IsReverse*/ false, IsUnitStride, CurrentPart, Builder);
   Value *Ptr = State.get(getOperand(0), VPLane(0));
 
+  Stride = Builder.CreateSExtOrTrunc(Stride, IndexTy);
   Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
-  Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Increment,
-                                       "", getGEPNoWrapFlags());
+  Value *Index = IsStrideOne ? Increment : Builder.CreateMul(Increment, Stride);
+  Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Index, "",
+                                       getGEPNoWrapFlags());
 
   State.set(this, ResultPtr, /*IsScalar*/ true);
 }
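
For intuition about the index math above: Increment is the start offset of the current unroll part (CurrentPart * VF), and for a non-unit stride it is scaled by the stride before feeding the GEP, all in elements of the source type. A standalone arithmetic illustration for a fixed-width VF (hypothetical helper, not an LLVM API):

#include <cstdint>
// Element offset of unroll part `Part` for vectorization factor `VF` and
// element stride `Stride`; Stride == 1 reproduces the previous unit-stride GEP.
int64_t partStartOffset(int64_t Part, int64_t VF, int64_t Stride) {
  int64_t Increment = Part * VF; // what createStepForVF yields for a fixed VF
  return Stride == 1 ? Increment : Increment * Stride;
}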
@@ -3355,9 +3366,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,
   const Align Alignment = getLoadStoreAlignment(&Ingredient);
   unsigned AS = cast<PointerType>(Ctx.Types.inferScalarType(getAddr()))
                     ->getAddressSpace();
-  unsigned Opcode = isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(this)
-                        ? Instruction::Load
-                        : Instruction::Store;
+  unsigned Opcode =
+      isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
+          this)
+          ? Instruction::Load
+          : Instruction::Store;
 
   if (!Consecutive) {
     // TODO: Using the original IR may not be accurate.

@@ -3367,8 +3380,11 @@
            "Inconsecutive memory access should not have the order.");
 
     const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
-    Type *PtrTy = Ptr->getType();
+    if (isa<VPWidenStridedLoadRecipe>(this))
+      return Ctx.TTI.getStridedMemoryOpCost(
+          Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient);
 
+    Type *PtrTy = Ptr->getType();
     // If the address value is uniform across all lanes, then the address can be
     // calculated with scalar type and broadcast.
     if (!vputils::isSingleScalar(getAddr()))

@@ -3523,6 +3539,47 @@ void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
 }
 #endif
 
+void VPWidenStridedLoadRecipe::execute(VPTransformState &State) {
+  Type *ScalarDataTy = getLoadStoreType(&Ingredient);
+  auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
+  const Align Alignment = getLoadStoreAlignment(&Ingredient);
+
+  auto &Builder = State.Builder;
+  Value *Addr = State.get(getAddr(), /*IsScalar*/ true);
+  Value *StrideInBytes = State.get(getStride(), /*IsScalar*/ true);
+  Value *Mask = nullptr;
+  if (VPValue *VPMask = getMask())
+    Mask = State.get(VPMask);
+  else
+    Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
+  Value *RunTimeVF = Builder.CreateZExtOrTrunc(State.get(getVF(), VPLane(0)),
+                                               Builder.getInt32Ty());
+
+  auto *PtrTy = Addr->getType();
+  auto *StrideTy = StrideInBytes->getType();
+  CallInst *NewLI = Builder.CreateIntrinsic(
+      Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
+      {Addr, StrideInBytes, Mask, RunTimeVF}, nullptr, "wide.strided.load");
+  NewLI->addParamAttr(
+      0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
+  applyMetadata(*NewLI);
+  State.set(this, NewLI);
+}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent,
+                                     VPSlotTracker &SlotTracker) const {
+  O << Indent << "WIDEN ";
+  printAsOperand(O, SlotTracker);
+  O << " = load ";
+  getAddr()->printAsOperand(O, SlotTracker);
+  O << ", stride = ";
+  getStride()->printAsOperand(O, SlotTracker);
+  O << ", runtimeVF = ";
+  getVF()->printAsOperand(O, SlotTracker);
+}
+#endif
+
 void VPWidenStoreRecipe::execute(VPTransformState &State) {
   VPValue *StoredVPValue = getStoredValue();
   bool CreateScatter = !isConsecutive();
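
Putting the two new method bodies together: based only on the execute() and print() implementations above, a strided load of <vscale x 4 x i32> would show up in VPlan dumps roughly as "WIDEN ir<%l> = load vp<%addr>, stride = vp<%stride>, runtimeVF = vp<%evl>", and would lower to a call along these lines (illustrative IR, value names made up; the byte stride of 8 assumes an i32 load with an element stride of 2):

%wide.strided.load = call <vscale x 4 x i32>
    @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(
        ptr align 4 %addr, i64 8, <vscale x 4 x i1> %mask, i32 %evl)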
