34 changes: 24 additions & 10 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -3961,7 +3961,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks(
[](const auto *R) { return Instruction::Select; })
.Case<VPWidenStoreRecipe>(
[](const auto *R) { return Instruction::Store; })
.Case<VPWidenLoadRecipe>(
.Case<VPWidenLoadRecipe, VPWidenStridedLoadRecipe>(
[](const auto *R) { return Instruction::Load; })
.Case<VPWidenCallRecipe, VPWidenIntrinsicRecipe>(
[](const auto *R) { return Instruction::Call; })
@@ -4061,6 +4061,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF,
case VPDef::VPReductionPHISC:
case VPDef::VPInterleaveEVLSC:
case VPDef::VPInterleaveSC:
case VPDef::VPWidenStridedLoadSC:
case VPDef::VPWidenLoadEVLSC:
case VPDef::VPWidenLoadSC:
case VPDef::VPWidenStoreEVLSC:
@@ -6963,6 +6964,12 @@ static bool planContainsAdditionalSimplifications(VPlan &Plan,
RepR->getUnderlyingInstr(), VF))
return true;
}

// The strided load is created from a gather by a VPlanTransform, and its
// cost will be lower than that of the original gather.
if (isa<VPWidenStridedLoadRecipe>(&R))
return true;

if (Instruction *UI = GetInstructionForCost(&R)) {
// If we adjusted the predicate of the recipe, the cost in the legacy
// cost model may be different.
@@ -7520,7 +7527,10 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef<VPValue *> Operands,
new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I),
/*Stride*/ -1, Flags, I->getDebugLoc());
} else {
VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I),
const DataLayout &DL = I->getDataLayout();
auto *StrideTy = DL.getIndexType(Ptr->getUnderlyingValue()->getType());
VPValue *StrideOne = Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, 1));
[Review thread]
Contributor: Suggested change:
-      VPValue *StrideOne = Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, 1));
+      VPValue *StrideOne = Plan.getConstantInt(StrideTy, 1);
VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), StrideOne,
GEP ? GEP->getNoWrapFlags()
: GEPNoWrapFlags::none(),
I->getDebugLoc());
@@ -8420,20 +8430,15 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
*Plan))
return nullptr;

VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
*CM.PSE.getSE());
// Transform recipes to abstract recipes if it is legal and beneficial and
// clamp the range for better cost estimation.
// TODO: Enable following transform when the EVL-version of extended-reduction
// and mulacc-reduction are implemented.
if (!CM.foldTailWithEVL()) {
VPCostContext CostCtx(CM.TTI, *CM.TLI, *Plan, CM, CM.CostKind,
*CM.PSE.getSE());
if (!CM.foldTailWithEVL())
VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan,
CostCtx, Range);
}

for (ElementCount VF : Range)
Plan->addVF(VF);
Plan->setName("Initial VPlan");

// Interleave memory: for each Interleave Group we marked earlier as relevant
// for this VPlan, replace the Recipes widening its memory instructions with a
@@ -8446,6 +8451,15 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes(
VPlanTransforms::runPass(VPlanTransforms::replaceSymbolicStrides, *Plan, PSE,
Legal->getLAI()->getSymbolicStrides());

// Convert memory recipes to strided access recipes if the strided access is
// legal and profitable.
VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan,
CostCtx, Range);

for (ElementCount VF : Range)
Plan->addVF(VF);
Plan->setName("Initial VPlan");

auto BlockNeedsPredication = [this](BasicBlock *BB) {
return Legal->blockNeedsPredication(BB);
};
99 changes: 88 additions & 11 deletions llvm/lib/Transforms/Vectorize/VPlan.h
@@ -559,6 +559,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue {
case VPRecipeBase::VPInterleaveEVLSC:
case VPRecipeBase::VPInterleaveSC:
case VPRecipeBase::VPIRInstructionSC:
case VPRecipeBase::VPWidenStridedLoadSC:
case VPRecipeBase::VPWidenLoadEVLSC:
case VPRecipeBase::VPWidenLoadSC:
case VPRecipeBase::VPWidenStoreEVLSC:
@@ -1773,10 +1774,6 @@ struct LLVM_ABI_FOR_TEST VPWidenSelectRecipe : public VPRecipeWithIRFlags,
class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
Type *SourceElementTy;

bool isPointerLoopInvariant() const {
return getOperand(0)->isDefinedOutsideLoopRegions();
}

bool isIndexLoopInvariant(unsigned I) const {
return getOperand(I + 1)->isDefinedOutsideLoopRegions();
}
@@ -1809,6 +1806,30 @@ class LLVM_ABI_FOR_TEST VPWidenGEPRecipe : public VPRecipeWithIRFlags {
/// This recipe generates a GEP instruction.
unsigned getOpcode() const { return Instruction::GetElementPtr; }

bool isPointerLoopInvariant() const {
return getOperand(0)->isDefinedOutsideLoopRegions();
}
[Review thread on lines +1809 to +1811]
Contributor: unrelated move?
Author: Changed it to public because determineBaseAndStride calls isPointerLoopInvariant() to temporarily exclude cases where the base address is not invariant.
Contributor: Oh I see, thanks.


/// Return the index of the unique loop-variant index operand of this GEP,
/// or std::nullopt if all indices are loop-invariant or more than one is
/// variant.
std::optional<unsigned> getUniqueVariantIndex() const {
std::optional<unsigned> VarIdx;
for (unsigned I = 0, E = getNumOperands() - 1; I < E; ++I) {
if (isIndexLoopInvariant(I))
continue;

if (VarIdx)
return std::nullopt;
VarIdx = I;
}
return VarIdx;
}

/// Returns the element type for the first \p I indices of this recipe.
Type *getIndexedType(unsigned I) const {
[Review thread]
Contributor: would be good to document
auto *GEP = cast<GetElementPtrInst>(getUnderlyingInstr());
SmallVector<Value *, 4> Ops(GEP->idx_begin(), GEP->idx_begin() + I);
return GetElementPtrInst::getIndexedType(SourceElementTy, Ops);
}

/// Generate the gep nodes.
void execute(VPTransformState &State) override;

@@ -1899,20 +1920,23 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags,
#endif
};

/// A recipe to compute the pointers for widened memory accesses of IndexTy.
/// A recipe to compute the pointers for widened memory accesses of
/// SourceElementTy, with the Stride expressed in units of SourceElementTy.
class VPVectorPointerRecipe : public VPRecipeWithIRFlags,
public VPUnrollPartAccessor<1> {
public VPUnrollPartAccessor<2> {
Type *SourceElementTy;

public:
VPVectorPointerRecipe(VPValue *Ptr, Type *SourceElementTy,
VPVectorPointerRecipe(VPValue *Ptr, Type *SourceElementTy, VPValue *Stride,
GEPNoWrapFlags GEPFlags, DebugLoc DL)
: VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef<VPValue *>(Ptr),
GEPFlags, DL),
: VPRecipeWithIRFlags(VPDef::VPVectorPointerSC,
ArrayRef<VPValue *>({Ptr, Stride}), GEPFlags, DL),
SourceElementTy(SourceElementTy) {}

VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC)

VPValue *getStride() const { return getOperand(1); }

void execute(VPTransformState &State) override;

Type *getSourceElementType() const { return SourceElementTy; }
@@ -1933,7 +1957,8 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags,

VPVectorPointerRecipe *clone() override {
return new VPVectorPointerRecipe(getOperand(0), SourceElementTy,
getGEPNoWrapFlags(), getDebugLoc());
getStride(), getGEPNoWrapFlags(),
getDebugLoc());
}

/// Return true if this VPVectorPointerRecipe corresponds to part 0. Note that
@@ -3205,7 +3230,8 @@ class LLVM_ABI_FOR_TEST VPWidenMemoryRecipe : public VPRecipeBase,
return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStoreSC ||
R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC;
R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC ||
R->getVPDefID() == VPRecipeBase::VPWidenStridedLoadSC;
}

static inline bool classof(const VPUser *U) {
@@ -3326,6 +3352,57 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue {
}
};

/// A recipe for strided load operations, using the base address, stride, VF,
/// and an optional mask. This recipe will generate a vp.strided.load intrinsic
/// call to represent memory accesses with a fixed stride.
struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe,
public VPValue {
VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride,
[Review thread]
Contributor:
Can we store the required info independent of the LLVM IR instruction?

It would also help to clarify what information is actually used, and why this cannot simply be a VPWidenIntrinsicRecipe, as it would map 1-1 to an intrinsic.

Author:

> Can we store the required info independent of the LLVM IR instruction?

Maybe, if we also store the type information in the recipe. But I want to know, in what scenario would we create a VPWidenStridedLoadRecipe without a LoadInst?

> It would also help to clarify what information is actually used, and why this cannot simply be a VPWidenIntrinsicRecipe, as it would map 1-1 to an intrinsic.

I tried to do that: Mel-Chen@d19fe61
This approach does help eliminate one recipe VPWidenStridedLoadRecipe, but there are still a few details we need to be careful about:
First, the current VPWidenIntrinsicRecipe cannot set attributes, so alignment information will be lost. This could be resolved by passing an AttributeList to VPWidenIntrinsicRecipe, allowing it to add the attributes during ::execute.

      // ???: How to set alignment?
      auto *StridedLoad = new VPWidenIntrinsicRecipe(
          Ingredient, Intrinsic::experimental_vp_strided_load,
          {NewPtr, StrideInBytes, Mask, I32VF},
          TypeInfo.inferScalarType(LoadR), LoadR->getDebugLoc());

Next, it might become difficult to ensure profitability before generating the strided access (i.e., we may not be able to achieve the change suggested in this comment). For more accurate profitability analysis, it would be better to call getCostForIntrinsics directly during the profitability check, which requires that all operands are already prepared.

        // Better to make getCostForIntrinsics to utils function, and directly
        // call getCostForIntrinsics to get the cost.
        SmallVector<Type *, 4> ParamTys = {
            TypeInfo.inferScalarType(BasePtr),
            TypeInfo.inferScalarType(StrideInElement),
            Type::getInt1Ty(Plan.getContext()),
            Type::getInt32Ty(Plan.getContext())};
        // FIXME: Copy from getCostForIntrinsics, but I think this is a bug. We
        // don't have to vectorize every operand. Should be fixed in
        // getCostForIntrinsics.
        for (auto &Ty : ParamTys)
          Ty = toVectorTy(Ty, VF);
        IntrinsicCostAttributes CostAttrs(
            Intrinsic::experimental_vp_strided_load, DataTy, {}, ParamTys,
            FastMathFlags(), nullptr, InstructionCost::getInvalid(), &Ctx.TLI);
        const InstructionCost StridedLoadStoreCost =
            Ctx.TTI.getIntrinsicInstrCost(CostAttrs, Ctx.CostKind);
        return StridedLoadStoreCost < CurrentCost;

Finally, using VPWidenIntrinsicRecipe reduces readability during transformations. Currently, VPWidenStridedLoadRecipe provides members like getAddr() and getStride(), which improves readability. This issue is not limited to EVL lowering; it should also affect #164205.

      .Case<VPWidenIntrinsicRecipe>(
          [&](VPWidenIntrinsicRecipe *WI) -> VPRecipeBase * {
            if (WI->getVectorIntrinsicID() ==
                Intrinsic::experimental_vp_strided_load) {
              VPWidenIntrinsicRecipe *NewWI = WI->clone();
              if (VPValue *NewMask = GetNewMask(WI->getOperand(2)))
                NewWI->setOperand(2, NewMask);
              else
                NewWI->setOperand(2, &AllOneMask);
              NewWI->setOperand(3, &EVL);
              return NewWI;
            }
            return nullptr;
          })

That’s my take on this. What’s your opinion?

Contributor:
> First, the current VPWidenIntrinsicRecipe cannot set attributes, so alignment information will be lost. This could be resolved by passing an AttributeList to VPWidenIntrinsicRecipe, allowing it to add the attributes during ::execute.

Hm yes, this is a general modeling issue, but could be improved separately (and does not only apply to this intrinsic I think). Is setting the alignment critical for the initial implementation?

> Next, it might become difficult to ensure profitability before generating the strided access (i.e., we may not be able to achieve the change suggested in this comment).

I am not sure I understand this completely; we should be able to check the profitability up-front regardless of what recipe we create later on, I think?

> Finally, using VPWidenIntrinsicRecipe reduces readability during transformations. Currently, VPWidenStridedLoadRecipe provides members like getAddr() and getStride(), which improves readability. This issue is not limited to EVL lowering; it should also affect #164205.

I think this could be mostly taken care of by providing a m_StridedLoad() pattern?
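
For illustration, a minimal sketch of such a helper using only the accessors this patch adds; the name and free-function form are hypothetical, and a real m_StridedLoad() would likely reuse the Recipe_match templates from VPlanPatternMatch.h so it composes with other matchers:

// Hypothetical helper, not part of this patch: bind the operands of a
// strided load recipe so transforms can avoid open-coded dyn_cast and
// getOperand chains.
static bool matchStridedLoad(VPRecipeBase &R, VPValue *&Addr, VPValue *&Stride,
                             VPValue *&VF) {
  auto *Load = dyn_cast<VPWidenStridedLoadRecipe>(&R);
  if (!Load)
    return false;
  Addr = Load->getAddr();
  Stride = Load->getStride();
  VF = Load->getVF();
  return true;
}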

VPValue *VF, VPValue *Mask,
const VPIRMetadata &Metadata, DebugLoc DL)
: VPWidenMemoryRecipe(
VPDef::VPWidenStridedLoadSC, Load, {Addr, Stride, VF},
/*Consecutive=*/false, /*Reverse=*/false, Metadata, DL),
VPValue(this, &Load) {
setMask(Mask);
}

VPWidenStridedLoadRecipe *clone() override {
return new VPWidenStridedLoadRecipe(cast<LoadInst>(Ingredient), getAddr(),
getStride(), getVF(), getMask(), *this,
getDebugLoc());
}

VP_CLASSOF_IMPL(VPDef::VPWidenStridedLoadSC);

/// Return the stride operand.
VPValue *getStride() const { return getOperand(1); }

/// Return the VF operand.
VPValue *getVF() const { return getOperand(2); }

/// Generate a strided load.
void execute(VPTransformState &State) override;

/// Return the cost of this VPWidenStridedLoadRecipe.
InstructionCost computeCost(ElementCount VF,
VPCostContext &Ctx) const override;

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// Print the recipe.
void print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const override;
#endif

/// Returns true if the recipe only uses the first lane of operand \p Op.
bool onlyFirstLaneUsed(const VPValue *Op) const override {
assert(is_contained(operands(), Op) &&
"Op must be an operand of the recipe");
// All operands except the mask are only used for the first lane.
return Op == getAddr() || Op == getStride() || Op == getVF();
[Review thread]
Contributor: Might be good to document that this includes everything except the mask.
}
};
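
For context, an illustrative usage sketch of how a transform such as convertToStridedAccesses might build this recipe when replacing a widened load; LoadR, BasePtr, and StrideInBytes are hypothetical values assumed to have been computed beforehand:

// Sketch under the above assumptions: swap a VPWidenLoadRecipe for the
// strided variant and rewire its users.
auto *Load = cast<LoadInst>(&LoadR->getIngredient());
auto *StridedLoad = new VPWidenStridedLoadRecipe(
    *Load, BasePtr, StrideInBytes, &Plan.getVF(), LoadR->getMask(),
    /*Metadata=*/*LoadR, LoadR->getDebugLoc());
StridedLoad->insertBefore(LoadR);
LoadR->replaceAllUsesWith(StridedLoad);
LoadR->eraseFromParent();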

/// A recipe for widening store operations, using the stored value, the address
/// to store to and an optional mask.
struct LLVM_ABI_FOR_TEST VPWidenStoreRecipe final : public VPWidenMemoryRecipe {
6 changes: 4 additions & 2 deletions llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -190,8 +190,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) {
}

Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) {
assert((isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe>(R)) &&
"Store recipes should not define any values");
assert(
(isa<VPWidenLoadRecipe, VPWidenLoadEVLRecipe, VPWidenStridedLoadRecipe>(
R)) &&
"Store recipes should not define any values");
return cast<LoadInst>(&R->getIngredient())->getType();
}

71 changes: 66 additions & 5 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -83,6 +83,7 @@ bool VPRecipeBase::mayWriteToMemory() const {
case VPWidenCastSC:
case VPWidenGEPSC:
case VPWidenIntOrFpInductionSC:
case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
case VPWidenPHISC:
@@ -106,6 +107,7 @@ bool VPRecipeBase::mayReadFromMemory() const {
return cast<VPExpressionRecipe>(this)->mayReadOrWriteMemory();
case VPInstructionSC:
return cast<VPInstruction>(this)->opcodeMayReadOrWriteFromMemory();
case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
return true;
@@ -189,6 +191,7 @@ bool VPRecipeBase::mayHaveSideEffects() const {
case VPInterleaveEVLSC:
case VPInterleaveSC:
return mayWriteToMemory();
case VPWidenStridedLoadSC:
case VPWidenLoadEVLSC:
case VPWidenLoadSC:
case VPWidenStoreEVLSC:
@@ -2603,13 +2606,21 @@ void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent,
void VPVectorPointerRecipe::execute(VPTransformState &State) {
auto &Builder = State.Builder;
unsigned CurrentPart = getUnrollPart(*this);
Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false,
/*IsUnitStride*/ true, CurrentPart, Builder);
Value *Stride = State.get(getStride(), /*IsScalar*/ true);

auto *StrideC = dyn_cast<ConstantInt>(Stride);
bool IsStrideOne = StrideC && StrideC->isOne();
bool IsUnitStride = IsStrideOne || (StrideC && StrideC->isMinusOne());
Type *IndexTy =
getGEPIndexTy(State.VF.isScalable(),
/*IsReverse*/ false, IsUnitStride, CurrentPart, Builder);
Value *Ptr = State.get(getOperand(0), VPLane(0));

Stride = Builder.CreateSExtOrTrunc(Stride, IndexTy);
Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart);
Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Increment,
"", getGEPNoWrapFlags());
Value *Index = IsStrideOne ? Increment : Builder.CreateMul(Increment, Stride);
Value *ResultPtr = Builder.CreateGEP(getSourceElementType(), Ptr, Index, "",
getGEPNoWrapFlags());

State.set(this, ResultPtr, /*IsScalar*/ true);
}
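
// Illustration with hypothetical numbers, not taken from the patch: for
// SourceElementTy = i32, Stride = 3, and unroll part 1 at a fixed VF of 4,
// Increment is 4 and Index is 12, so the code above emits
//   %vector.ptr = getelementptr i32, ptr %ptr, i64 12
// i.e. each unroll part advances the pointer by VF * Stride elements.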
@@ -3510,7 +3521,6 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF,

const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
Type *PtrTy = Ptr->getType();

// If the address value is uniform across all lanes, then the address can be
// calculated with scalar type and broadcast.
if (!vputils::isSingleScalar(getAddr()))
@@ -3665,6 +3675,57 @@ void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent,
}
#endif

void VPWidenStridedLoadRecipe::execute(VPTransformState &State) {
Type *ScalarDataTy = getLoadStoreType(&Ingredient);
auto *DataTy = VectorType::get(ScalarDataTy, State.VF);
const Align Alignment = getLoadStoreAlignment(&Ingredient);

auto &Builder = State.Builder;
Value *Addr = State.get(getAddr(), /*IsScalar*/ true);
Value *StrideInBytes = State.get(getStride(), /*IsScalar*/ true);
Value *Mask = nullptr;
if (VPValue *VPMask = getMask())
Mask = State.get(VPMask);
else
Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue());
Value *RunTimeVF = Builder.CreateZExtOrTrunc(State.get(getVF(), VPLane(0)),
Builder.getInt32Ty());

auto *PtrTy = Addr->getType();
auto *StrideTy = StrideInBytes->getType();
CallInst *NewLI = Builder.CreateIntrinsic(
Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy},
{Addr, StrideInBytes, Mask, RunTimeVF}, nullptr, "wide.strided.load");
NewLI->addParamAttr(
0, Attribute::getWithAlignment(NewLI->getContext(), Alignment));
applyMetadata(*NewLI);
State.set(this, NewLI);
}
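
// Illustration with hypothetical types and value names: for i32 elements at
// a scalable VF, the call created above has the form
//   %wide.strided.load = call <vscale x 4 x i32>
//       @llvm.experimental.vp.strided.load.nxv4i32.p0.i64(
//           ptr align 4 %addr, i64 %stride, <vscale x 4 x i1> %mask, i32 %evl)
// where the stride is in bytes and the runtime VF is zext/trunc'd to i32.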

InstructionCost
VPWidenStridedLoadRecipe::computeCost(ElementCount VF,
VPCostContext &Ctx) const {
Type *Ty = toVectorTy(getLoadStoreType(&Ingredient), VF);
const Value *Ptr = getLoadStorePointerOperand(&Ingredient);
const Align Alignment = getLoadStoreAlignment(&Ingredient);
return Ctx.TTI.getStridedMemoryOpCost(Instruction::Load, Ty, Ptr, IsMasked,
Alignment, Ctx.CostKind, &Ingredient);
}

#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent,
VPSlotTracker &SlotTracker) const {
O << Indent << "WIDEN ";
printAsOperand(O, SlotTracker);
O << " = load ";
getAddr()->printAsOperand(O, SlotTracker);
O << ", stride = ";
getStride()->printAsOperand(O, SlotTracker);
O << ", runtimeVF = ";
getVF()->printAsOperand(O, SlotTracker);
}
#endif
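
// Example of the debug output produced by the printer above, with
// illustrative operand names:
//   WIDEN vp<%wide.strided.load> = load vp<%base>, stride = vp<%stride>,
//         runtimeVF = vp<%evl>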

void VPWidenStoreRecipe::execute(VPTransformState &State) {
VPValue *StoredVPValue = getStoredValue();
bool CreateScatter = !isConsecutive();