diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index e7bae17dd2ceb..13609a279b4b5 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4183,7 +4183,7 @@ void LoopVectorizationPlanner::emitInvalidCostRemarks( [](const auto *R) { return Instruction::Select; }) .Case( [](const auto *R) { return Instruction::Store; }) - .Case( + .Case( [](const auto *R) { return Instruction::Load; }) .Case( [](const auto *R) { return Instruction::Call; }) @@ -4282,6 +4282,7 @@ static bool willGenerateVectors(VPlan &Plan, ElementCount VF, case VPDef::VPWidenPointerInductionSC: case VPDef::VPReductionPHISC: case VPDef::VPInterleaveSC: + case VPDef::VPWidenStridedLoadSC: case VPDef::VPWidenLoadEVLSC: case VPDef::VPWidenLoadSC: case VPDef::VPWidenStoreEVLSC: @@ -7769,7 +7770,10 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, ArrayRef Operands, new VPVectorEndPointerRecipe(Ptr, &Plan.getVF(), getLoadStoreType(I), /*Stride*/ -1, Flags, I->getDebugLoc()); } else { - VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), + const DataLayout &DL = I->getDataLayout(); + auto *StrideTy = DL.getIndexType(Ptr->getUnderlyingValue()->getType()); + VPValue *StrideOne = Plan.getOrAddLiveIn(ConstantInt::get(StrideTy, 1)); + VectorPtr = new VPVectorPointerRecipe(Ptr, getLoadStoreType(I), StrideOne, GEP ? GEP->getNoWrapFlags() : GEPNoWrapFlags::none(), I->getDebugLoc()); @@ -8908,20 +8912,15 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( // Adjust the recipes for any inloop reductions. adjustRecipesForReductions(Plan, RecipeBuilder, Range.Start); + VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, + CM.CostKind); // Transform recipes to abstract recipes if it is legal and beneficial and // clamp the range for better cost estimation. // TODO: Enable following transform when the EVL-version of extended-reduction // and mulacc-reduction are implemented. - if (!CM.foldTailWithEVL()) { - VPCostContext CostCtx(CM.TTI, *CM.TLI, Legal->getWidestInductionType(), CM, - CM.CostKind); + if (!CM.foldTailWithEVL()) VPlanTransforms::runPass(VPlanTransforms::convertToAbstractRecipes, *Plan, CostCtx, Range); - } - - for (ElementCount VF : Range) - Plan->addVF(VF); - Plan->setName("Initial VPlan"); // Interleave memory: for each Interleave Group we marked earlier as relevant // for this VPlan, replace the Recipes widening its memory instructions with a @@ -8930,6 +8929,15 @@ VPlanPtr LoopVectorizationPlanner::tryToBuildVPlanWithVPRecipes( InterleaveGroups, RecipeBuilder, CM.isScalarEpilogueAllowed()); + // Convert memory recipes to strided access recipes if the strided access is + // legal and profitable. + VPlanTransforms::runPass(VPlanTransforms::convertToStridedAccesses, *Plan, + CostCtx, Range); + + for (ElementCount VF : Range) + Plan->addVF(VF); + Plan->setName("Initial VPlan"); + // Replace VPValues for known constant strides guaranteed by predicate scalar // evolution. 
auto CanUseVersionedStride = [&Plan](VPUser &U, unsigned) { diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 356af4a0e74e4..7e02387bd94e8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -557,6 +557,7 @@ class VPSingleDefRecipe : public VPRecipeBase, public VPValue { case VPRecipeBase::VPBranchOnMaskSC: case VPRecipeBase::VPInterleaveSC: case VPRecipeBase::VPIRInstructionSC: + case VPRecipeBase::VPWidenStridedLoadSC: case VPRecipeBase::VPWidenLoadEVLSC: case VPRecipeBase::VPWidenLoadSC: case VPRecipeBase::VPWidenStoreEVLSC: @@ -1643,20 +1644,6 @@ struct VPWidenSelectRecipe : public VPRecipeWithIRFlags, public VPIRMetadata { /// A recipe for handling GEP instructions. class VPWidenGEPRecipe : public VPRecipeWithIRFlags { - bool isPointerLoopInvariant() const { - return getOperand(0)->isDefinedOutsideLoopRegions(); - } - - bool isIndexLoopInvariant(unsigned I) const { - return getOperand(I + 1)->isDefinedOutsideLoopRegions(); - } - - bool areAllOperandsInvariant() const { - return all_of(operands(), [](VPValue *Op) { - return Op->isDefinedOutsideLoopRegions(); - }); - } - public: VPWidenGEPRecipe(GetElementPtrInst *GEP, ArrayRef Operands) : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP) { @@ -1675,6 +1662,20 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags { VP_CLASSOF_IMPL(VPDef::VPWidenGEPSC) + bool isPointerLoopInvariant() const { + return getOperand(0)->isDefinedOutsideLoopRegions(); + } + + bool isIndexLoopInvariant(unsigned I) const { + return getOperand(I + 1)->isDefinedOutsideLoopRegions(); + } + + bool areAllOperandsInvariant() const { + return all_of(operands(), [](VPValue *Op) { + return Op->isDefinedOutsideLoopRegions(); + }); + } + /// Generate the gep nodes. void execute(VPTransformState &State) override; @@ -1763,20 +1764,23 @@ class VPVectorEndPointerRecipe : public VPRecipeWithIRFlags, #endif }; -/// A recipe to compute the pointers for widened memory accesses of IndexTy. +/// A recipe to compute the pointers for widened memory accesses of IndexedTy, +/// with the Stride expressed in units of IndexedTy. 
class VPVectorPointerRecipe : public VPRecipeWithIRFlags, - public VPUnrollPartAccessor<1> { + public VPUnrollPartAccessor<2> { Type *IndexedTy; public: - VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, GEPNoWrapFlags GEPFlags, - DebugLoc DL) - : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef(Ptr), - GEPFlags, DL), + VPVectorPointerRecipe(VPValue *Ptr, Type *IndexedTy, VPValue *Stride, + GEPNoWrapFlags GEPFlags, DebugLoc DL) + : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, + ArrayRef({Ptr, Stride}), GEPFlags, DL), IndexedTy(IndexedTy) {} VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC) + VPValue *getStride() const { return getOperand(1); } + void execute(VPTransformState &State) override; bool onlyFirstLaneUsed(const VPValue *Op) const override { @@ -1794,7 +1798,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, } VPVectorPointerRecipe *clone() override { - return new VPVectorPointerRecipe(getOperand(0), IndexedTy, + return new VPVectorPointerRecipe(getOperand(0), IndexedTy, getStride(), getGEPNoWrapFlags(), getDebugLoc()); } @@ -2928,7 +2932,8 @@ class VPWidenMemoryRecipe : public VPRecipeBase, public VPIRMetadata { return R->getVPDefID() == VPRecipeBase::VPWidenLoadSC || R->getVPDefID() == VPRecipeBase::VPWidenStoreSC || R->getVPDefID() == VPRecipeBase::VPWidenLoadEVLSC || - R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC; + R->getVPDefID() == VPRecipeBase::VPWidenStoreEVLSC || + R->getVPDefID() == VPRecipeBase::VPWidenStridedLoadSC; } static inline bool classof(const VPUser *U) { @@ -3047,6 +3052,52 @@ struct VPWidenLoadEVLRecipe final : public VPWidenMemoryRecipe, public VPValue { } }; +/// A recipe for strided load operations, using the base address, stride, and an +/// optional mask. This recipe will generate a vp.strided.load intrinsic call +/// to represent memory accesses with a fixed stride. +struct VPWidenStridedLoadRecipe final : public VPWidenMemoryRecipe, + public VPValue { + VPWidenStridedLoadRecipe(LoadInst &Load, VPValue *Addr, VPValue *Stride, + VPValue *VF, VPValue *Mask, + const VPIRMetadata &Metadata, DebugLoc DL) + : VPWidenMemoryRecipe( + VPDef::VPWidenStridedLoadSC, Load, {Addr, Stride, VF}, + /*Consecutive=*/false, /*Reverse=*/false, Metadata, DL), + VPValue(this, &Load) { + setMask(Mask); + } + + VPWidenStridedLoadRecipe *clone() override { + return new VPWidenStridedLoadRecipe(cast(Ingredient), getAddr(), + getStride(), getVF(), getMask(), *this, + getDebugLoc()); + } + + VP_CLASSOF_IMPL(VPDef::VPWidenStridedLoadSC); + + /// Return the stride operand. + VPValue *getStride() const { return getOperand(1); } + + /// Return the VF operand. + VPValue *getVF() const { return getOperand(2); } + + /// Generate a strided load. + void execute(VPTransformState &State) override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print the recipe. + void print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const override; +#endif + + /// Returns true if the recipe only uses the first lane of operand \p Op. + bool onlyFirstLaneUsed(const VPValue *Op) const override { + assert(is_contained(operands(), Op) && + "Op must be an operand of the recipe"); + return Op == getAddr() || Op == getStride() || Op == getVF(); + } +}; + /// A recipe for widening store operations, using the stored value, the address /// to store to and an optional mask.
struct VPWidenStoreRecipe final : public VPWidenMemoryRecipe { diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp index 92db9674ef42b..714fef032c9b1 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp @@ -184,8 +184,10 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenCallRecipe *R) { } Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPWidenMemoryRecipe *R) { - assert((isa(R)) && - "Store recipes should not define any values"); + assert( + (isa( + R)) && + "Store recipes should not define any values"); return cast(&R->getIngredient())->getType(); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index ccb7512051d77..adc06d3d4bc16 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -80,6 +80,7 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPWidenCastSC: case VPWidenGEPSC: case VPWidenIntOrFpInductionSC: + case VPWidenStridedLoadSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: case VPWidenPHISC: @@ -103,6 +104,7 @@ bool VPRecipeBase::mayReadFromMemory() const { return cast(this)->mayReadOrWriteMemory(); case VPInstructionSC: return cast(this)->opcodeMayReadOrWriteFromMemory(); + case VPWidenStridedLoadSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: return true; @@ -184,6 +186,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { } case VPInterleaveSC: return mayWriteToMemory(); + case VPWidenStridedLoadSC: case VPWidenLoadEVLSC: case VPWidenLoadSC: case VPWidenStoreEVLSC: @@ -2379,13 +2382,22 @@ void VPVectorEndPointerRecipe::print(raw_ostream &O, const Twine &Indent, void VPVectorPointerRecipe::execute(VPTransformState &State) { auto &Builder = State.Builder; unsigned CurrentPart = getUnrollPart(*this); - Type *IndexTy = getGEPIndexTy(State.VF.isScalable(), /*IsReverse*/ false, - /*IsUnitStride*/ true, CurrentPart, Builder); + Value *Stride = State.get(getStride(), /*IsScalar*/ true); + + auto *StrideC = dyn_cast(Stride); + bool IsStrideOne = StrideC && StrideC->isOne(); + bool IsUnitStride = IsStrideOne || (StrideC && StrideC->isMinusOne()); + Type *IndexTy = + getGEPIndexTy(State.VF.isScalable(), + /*IsReverse*/ false, IsUnitStride, CurrentPart, Builder); Value *Ptr = State.get(getOperand(0), VPLane(0)); + Stride = Builder.CreateSExtOrTrunc(Stride, IndexTy); Value *Increment = createStepForVF(Builder, IndexTy, State.VF, CurrentPart); + Value *Index = IsStrideOne ? Increment : Builder.CreateMul(Increment, Stride); + Value *ResultPtr = - Builder.CreateGEP(IndexedTy, Ptr, Increment, "", getGEPNoWrapFlags()); + Builder.CreateGEP(IndexedTy, Ptr, Index, "", getGEPNoWrapFlags()); State.set(this, ResultPtr, /*IsScalar*/ true); } @@ -3063,9 +3075,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, getLoadStoreAlignment(const_cast(&Ingredient)); unsigned AS = cast(Ctx.Types.inferScalarType(getAddr())) ->getAddressSpace(); - unsigned Opcode = isa(this) - ? Instruction::Load - : Instruction::Store; + unsigned Opcode = + isa( + this) + ? Instruction::Load + : Instruction::Store; if (!Consecutive) { // TODO: Using the original IR may not be accurate. 
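The hunks that follow route non-consecutive strided loads to TTI::getStridedMemoryOpCost and lower VPWidenStridedLoadRecipe to llvm.experimental.vp.strided.load, whose stride operand is expressed in bytes and which only reads lanes whose index is below the explicit vector length and whose mask bit is set. A minimal scalar model of that behaviour, written as a standalone sketch with hypothetical names (the real lowering is in VPWidenStridedLoadRecipe::execute below):

```cpp
#include <cstdint>
#include <cstring>
#include <vector>

// Scalar model (illustrative only) of a strided load of float lanes:
// lane I reads from Addr + I * StrideInBytes when I < EVL and Mask[I] is set.
// Disabled lanes are left as 0.0f here for simplicity; the real intrinsic
// leaves them poison.
std::vector<float> stridedLoadModel(const std::uint8_t *Addr,
                                    std::int64_t StrideInBytes,
                                    const std::vector<bool> &Mask,
                                    unsigned EVL) {
  std::vector<float> Lanes(Mask.size(), 0.0f);
  for (unsigned I = 0; I < EVL && I < Mask.size(); ++I) {
    if (!Mask[I])
      continue;
    std::memcpy(&Lanes[I], Addr + std::int64_t(I) * StrideInBytes,
                sizeof(float));
  }
  return Lanes;
}
```

Note that the recipe itself carries the stride in elements; execute() converts it to bytes by multiplying with DL.getTypeAllocSize of the loaded scalar type before emitting the intrinsic call.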
@@ -3074,6 +3088,11 @@ InstructionCost VPWidenMemoryRecipe::computeCost(ElementCount VF, const Value *Ptr = getLoadStorePointerOperand(&Ingredient); assert(!Reverse && "Inconsecutive memory access should not have the order."); + + if (isa(this)) + return Ctx.TTI.getStridedMemoryOpCost( + Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient); + return Ctx.TTI.getAddressComputationCost(Ty) + Ctx.TTI.getGatherScatterOpCost(Opcode, Ty, Ptr, IsMasked, Alignment, Ctx.CostKind, &Ingredient); @@ -3224,6 +3243,50 @@ void VPWidenLoadEVLRecipe::print(raw_ostream &O, const Twine &Indent, } #endif +void VPWidenStridedLoadRecipe::execute(VPTransformState &State) { + Type *ScalarDataTy = getLoadStoreType(&Ingredient); + auto *DataTy = VectorType::get(ScalarDataTy, State.VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + + auto &Builder = State.Builder; + Value *Addr = State.get(getAddr(), /*IsScalar*/ true); + Value *Stride = State.get(getStride(), /*IsScalar*/ true); + Value *Mask = nullptr; + if (VPValue *VPMask = getMask()) + Mask = State.get(VPMask); + else + Mask = Builder.CreateVectorSplat(State.VF, Builder.getTrue()); + Value *RunTimeVF = Builder.CreateZExtOrTrunc(State.get(getVF(), VPLane(0)), + Builder.getInt32Ty()); + + auto *PtrTy = Addr->getType(); + auto *StrideTy = Stride->getType(); + const DataLayout &DL = Ingredient.getDataLayout(); + Value *StrideInBytes = Builder.CreateMul( + Stride, ConstantInt::get(StrideTy, DL.getTypeAllocSize(ScalarDataTy))); + CallInst *NewLI = Builder.CreateIntrinsic( + Intrinsic::experimental_vp_strided_load, {DataTy, PtrTy, StrideTy}, + {Addr, StrideInBytes, Mask, RunTimeVF}, nullptr, "wide.strided.load"); + NewLI->addParamAttr( + 0, Attribute::getWithAlignment(NewLI->getContext(), Alignment)); + applyMetadata(*NewLI); + State.set(this, NewLI); +} + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPWidenStridedLoadRecipe::print(raw_ostream &O, const Twine &Indent, + VPSlotTracker &SlotTracker) const { + O << Indent << "WIDEN "; + printAsOperand(O, SlotTracker); + O << " = load "; + getAddr()->printAsOperand(O, SlotTracker); + O << ", stride = "; + getStride()->printAsOperand(O, SlotTracker); + O << ", runtimeVF = "; + getVF()->printAsOperand(O, SlotTracker); +} +#endif + void VPWidenStoreRecipe::execute(VPTransformState &State) { VPValue *StoredVPValue = getStoredValue(); bool CreateScatter = !isConsecutive(); diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 90137b72c83fb..645183fde3d7d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -2176,10 +2176,12 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPRegionBlock *LoopRegion = Plan.getVectorLoopRegion(); VPBasicBlock *Header = LoopRegion->getEntryBasicBlock(); - assert(all_of(Plan.getVF().users(), - IsaPred) && - "User of VF that we can't transform to EVL."); + assert( + all_of( + Plan.getVF().users(), + IsaPred) && + "User of VF that we can't transform to EVL."); Plan.getVF().replaceAllUsesWith(&EVL); // Defer erasing recipes till the end so that we don't invalidate the @@ -2242,7 +2244,8 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { NumDefVal <= 1 && "Only supports recipes with a single definition or without users."); EVLRecipe->insertBefore(CurRecipe); - if (isa(EVLRecipe)) { + if (isa(EVLRecipe)) { VPValue *CurVPV = CurRecipe->getVPSingleValue(); 
CurVPV->replaceAllUsesWith(EVLRecipe->getVPSingleValue()); } @@ -2678,6 +2681,181 @@ void VPlanTransforms::dissolveLoopRegions(VPlan &Plan) { R->dissolveToCFGLoop(); } +static std::pair matchStridedStart(VPValue *CurIndex) { + if (auto *WidenIV = dyn_cast(CurIndex)) + return {WidenIV, WidenIV->getStepValue()}; + + auto *WidenR = dyn_cast(CurIndex); + if (!WidenR || !CurIndex->getUnderlyingValue()) + return {nullptr, nullptr}; + + unsigned Opcode = WidenR->getOpcode(); + // TODO: Support Instruction::Add and Instruction::Or. + if (Opcode != Instruction::Shl && Opcode != Instruction::Mul) + return {nullptr, nullptr}; + + // Match the pattern binop(variant, invariant), or binop(invariant, variant) + // if the binary operator is commutative. + bool IsLHSUniform = vputils::isSingleScalar(WidenR->getOperand(0)); + if (IsLHSUniform == vputils::isSingleScalar(WidenR->getOperand(1)) || + (IsLHSUniform && !Instruction::isCommutative(Opcode))) + return {nullptr, nullptr}; + unsigned VarIdx = IsLHSUniform ? 1 : 0; + + auto [Start, Stride] = matchStridedStart(WidenR->getOperand(VarIdx)); + if (!Start) + return {nullptr, nullptr}; + + SmallVector StartOps(WidenR->operands()); + StartOps[VarIdx] = Start; + auto *StartR = new VPReplicateRecipe(WidenR->getUnderlyingInstr(), StartOps, + /*IsUniform*/ true); + StartR->insertBefore(WidenR); + + unsigned InvIdx = VarIdx == 0 ? 1 : 0; + auto *StrideR = + new VPInstruction(Opcode, {Stride, WidenR->getOperand(InvIdx)}); + StrideR->insertBefore(WidenR); + return {StartR, StrideR}; +} + +static std::pair +determineBaseAndStride(VPWidenGEPRecipe *WidenGEP) { + // Not considered strided if the base pointer and all indices are + // loop-invariant. + if (WidenGEP->areAllOperandsInvariant()) + return {nullptr, nullptr}; + + // TODO: Check if the base pointer is strided. + if (!WidenGEP->isPointerLoopInvariant()) + return {nullptr, nullptr}; + + // Find the single variant index. + unsigned VarOp = 0; + for (unsigned I = 1, E = WidenGEP->getNumOperands(); I < E; I++) { + if (WidenGEP->isIndexLoopInvariant(I - 1)) + continue; + + if (VarOp != 0) + return {nullptr, nullptr}; + VarOp = I; + } + + if (VarOp == 0) + return {nullptr, nullptr}; + + // TODO: Support cases with a variant index in the middle. + if (VarOp != WidenGEP->getNumOperands() - 1) + return {nullptr, nullptr}; + + VPValue *VarIndex = WidenGEP->getOperand(VarOp); + auto [Start, Stride] = matchStridedStart(VarIndex); + if (!Start) + return {nullptr, nullptr}; + + SmallVector Ops(WidenGEP->operands()); + Ops[VarOp] = Start; + auto *BasePtr = new VPReplicateRecipe(WidenGEP->getUnderlyingInstr(), Ops, + /*IsUniform*/ true); + BasePtr->insertBefore(WidenGEP); + + return {BasePtr, Stride}; +} + +void VPlanTransforms::convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, + VFRange &Range) { + if (Plan.hasScalarVFOnly()) + return; + + DenseMap> StrideCache; + SmallVector ToErase; + SmallPtrSet PossiblyDead; + for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly( + vp_depth_first_shallow(Plan.getVectorLoopRegion()->getEntry()))) { + for (VPRecipeBase &R : make_early_inc_range(*VPBB)) { + auto *MemR = dyn_cast(&R); + // TODO: Support strided stores. + // TODO: Support reverse accesses. + // TODO: Transform interleave accesses into multiple strided accesses. + if (!MemR || !isa(MemR) || MemR->isConsecutive()) + continue; + + auto *Ptr = dyn_cast(MemR->getAddr()); + if (!Ptr) + continue; + + // The memory cost model requires the pointer operand of the memory + // access instruction.
+ Value *PtrUV = Ptr->getUnderlyingValue(); + if (!PtrUV) + continue; + + // Try to get base and stride here. + VPValue *BasePtr, *Stride; + auto It = StrideCache.find(Ptr); + if (It != StrideCache.end()) + std::tie(BasePtr, Stride) = It->second; + else + std::tie(BasePtr, Stride) = StrideCache[Ptr] = + determineBaseAndStride(Ptr); + + // Skip if the memory access is not a strided access. + if (!BasePtr) { + assert(!Stride); + continue; + } + assert(Stride); + + Instruction &Ingredient = MemR->getIngredient(); + Type *ElementTy = getLoadStoreType(&Ingredient); + + auto IsProfitable = [&](ElementCount VF) -> bool { + Type *DataTy = toVectorTy(ElementTy, VF); + const Align Alignment = getLoadStoreAlignment(&Ingredient); + if (!Ctx.TTI.isLegalStridedLoadStore(DataTy, Alignment)) + return false; + const InstructionCost CurrentCost = MemR->computeCost(VF, Ctx); + const InstructionCost StridedLoadStoreCost = + Ctx.TTI.getStridedMemoryOpCost(Instruction::Load, DataTy, PtrUV, + MemR->isMasked(), Alignment, + Ctx.CostKind, &Ingredient); + return StridedLoadStoreCost < CurrentCost; + }; + + if (!LoopVectorizationPlanner::getDecisionAndClampRange(IsProfitable, + Range)) { + PossiblyDead.insert(BasePtr); + PossiblyDead.insert(Stride); + continue; + } + PossiblyDead.insert(Ptr); + + // Create a new vector pointer for strided access. + auto *GEP = dyn_cast(PtrUV->stripPointerCasts()); + auto *NewPtr = new VPVectorPointerRecipe(BasePtr, ElementTy, Stride, + GEP ? GEP->getNoWrapFlags() + : GEPNoWrapFlags::none(), + Ptr->getDebugLoc()); + NewPtr->insertBefore(MemR); + + auto *LoadR = cast(MemR); + auto *StridedLoad = new VPWidenStridedLoadRecipe( + *cast(&Ingredient), NewPtr, Stride, &Plan.getVF(), + LoadR->getMask(), *LoadR, LoadR->getDebugLoc()); + StridedLoad->insertBefore(LoadR); + LoadR->replaceAllUsesWith(StridedLoad); + + ToErase.push_back(LoadR); + } + } + + // Clean up dead memory access recipes, and any unused base address and + // stride recipes. + for (auto *R : ToErase) + R->eraseFromParent(); + for (auto *V : PossiblyDead) + recursivelyDeleteDeadRecipes(V); +} + void VPlanTransforms::convertToConcreteRecipes(VPlan &Plan, Type &CanonicalIVTy) { using namespace llvm::VPlanPatternMatch; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h index 8d2eded45da22..6043d06c7d79e 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h @@ -175,6 +175,12 @@ struct VPlanTransforms { &InterleaveGroups, VPRecipeBuilder &RecipeBuilder, const bool &ScalarEpilogueAllowed); + /// Transform widen memory recipes into strided access recipes when legal + /// and profitable. Clamps \p Range to maintain consistency with widen + /// decisions of \p Plan, and uses \p Ctx to evaluate the cost. + static void convertToStridedAccesses(VPlan &Plan, VPCostContext &Ctx, + VFRange &Range); + /// Remove dead recipes from \p Plan.
static void removeDeadRecipes(VPlan &Plan); diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 279cdac92d2d1..5dfdcf1522465 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -347,6 +347,7 @@ class VPDef { VPWidenCastSC, VPWidenGEPSC, VPWidenIntrinsicSC, + VPWidenStridedLoadSC, VPWidenLoadEVLSC, VPWidenLoadSC, VPWidenStoreEVLSC, diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 38ada33d7ee19..bc9d40834c185 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -157,7 +157,7 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { return VerifyEVLUse(*S, S->getNumOperands() - 1); }) .Case( + VPWidenIntOrFpInductionRecipe, VPWidenStridedLoadRecipe>( [&](const VPRecipeBase *S) { return VerifyEVLUse(*S, 2); }) .Case([&](auto *R) { if (R->getNumOperands() != 3) { diff --git a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll index 528f2448616e8..2c757021e76ff 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-dot-printing.ll @@ -42,11 +42,11 @@ define void @print_call_and_memory(i64 %n, ptr noalias %y, ptr noalias %x) nounw ; CHECK-NEXT: " EMIT vp\<[[CAN_IV:%.+]]\> = CANONICAL-INDUCTION ir\<0\>, vp\<[[CAN_IV_NEXT:%.+]]\>\l" + ; CHECK-NEXT: " vp\<[[STEPS:%.+]]\> = SCALAR-STEPS vp\<[[CAN_IV]]\>, ir\<1\>, vp\<[[VF]]\>\l" + ; CHECK-NEXT: " CLONE ir\<%arrayidx\> = getelementptr inbounds ir\<%y\>, vp\<[[STEPS]]\>\l" + -; CHECK-NEXT: " vp\<[[VEC_PTR:%.+]]\> = vector-pointer ir\<%arrayidx\>\l" + +; CHECK-NEXT: " vp\<[[VEC_PTR:%.+]]\> = vector-pointer ir\<%arrayidx\>, ir\<1\>\l" + ; CHECK-NEXT: " WIDEN ir\<%lv\> = load vp\<[[VEC_PTR]]\>\l" + ; CHECK-NEXT: " WIDEN-INTRINSIC ir\<%call\> = call llvm.sqrt(ir\<%lv\>)\l" + ; CHECK-NEXT: " CLONE ir\<%arrayidx2\> = getelementptr inbounds ir\<%x\>, vp\<[[STEPS]]\>\l" + -; CHECK-NEXT: " vp\<[[VEC_PTR2:%.+]]\> = vector-pointer ir\<%arrayidx2\>\l" + +; CHECK-NEXT: " vp\<[[VEC_PTR2:%.+]]\> = vector-pointer ir\<%arrayidx2\>, ir\<1\>\l" + ; CHECK-NEXT: " WIDEN store vp\<[[VEC_PTR2]]\>, ir\<%call\>\l" + ; CHECK-NEXT: " EMIT vp\<[[CAN_IV_NEXT]]\> = add nuw vp\<[[CAN_IV]]\>, vp\<[[VFxUF]]\>\l" + ; CHECK-NEXT: " EMIT branch-on-count vp\<[[CAN_IV_NEXT]]\>, vp\<[[VEC_TC]]\>\l" +
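For context, a sketch of the kind of source loop the new convertToStridedAccesses transform targets; the function below is hypothetical and only illustrates the shape being matched: a widened GEP whose single variant index is the induction variable scaled by a loop-invariant value through a mul (or shl).

```cpp
#include <cstddef>

// Illustrative source pattern (hypothetical function): the load of A[I * N]
// is non-consecutive, so it is currently widened as a gather or scalarized;
// with this patch it can instead become one llvm.experimental.vp.strided.load
// per vector iteration on targets where TTI reports the strided load as legal
// and cheaper.
float columnSum(const float *A, std::size_t Rows, std::size_t N) {
  float Sum = 0.0f;
  for (std::size_t I = 0; I < Rows; ++I)
    Sum += A[I * N]; // single variant GEP index: I scaled by loop-invariant N
  return Sum;
}
```

Because matchStridedStart recurses through mul/shl by a loop-invariant operand down to the widened induction itself, scaled indices such as (I << 2) or (I * N) * 4 are also recognized, with the start and stride rebuilt as uniform recipes before the wide GEP.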