diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index bc44ec11edb7b..fc10a518d39ef 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -40,6 +40,7 @@ class OptimizationRemarkEmitter; class TargetTransformInfo; class TargetLibraryInfo; class VPRecipeBuilder; +struct VFRange; /// VPlan-based builder utility analogous to IRBuilder. class VPBuilder { diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index a18a1e3a5a030..f88c13525421c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -59,6 +59,7 @@ #include "VPlan.h" #include "VPlanAnalysis.h" #include "VPlanHCFGBuilder.h" +#include "VPlanHelpers.h" #include "VPlanPatternMatch.h" #include "VPlanTransforms.h" #include "VPlanUtils.h" diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h index fdaa63b0ba4f3..e81247c98568b 100644 --- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h +++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h @@ -23,6 +23,7 @@ class LoopVectorizationCostModel; class TargetLibraryInfo; class TargetTransformInfo; struct HistogramInfo; +struct VFRange; /// A chain of instructions that form a partial reduction. 
/// Designed to match: reduction_bin_op (bin_op (extend (A), (extend (B))), diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index 4a1512abe4e48..5a88ebeffb18b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -19,6 +19,7 @@ #include "VPlan.h" #include "LoopVectorizationPlanner.h" #include "VPlanCFG.h" +#include "VPlanHelpers.h" #include "VPlanPatternMatch.h" #include "VPlanTransforms.h" #include "VPlanUtils.h" @@ -400,8 +401,8 @@ void VPTransformState::packScalarIntoVectorValue(VPValue *Def, set(Def, VectorValue); } -BasicBlock * -VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { +BasicBlock *VPBasicBlock::createEmptyBasicBlock(VPTransformState &State) { + auto &CFG = State.CFG; // BB stands for IR BasicBlocks. VPBB stands for VPlan VPBasicBlocks. // Pred stands for Predessor. Prev stands for Previous - last visited/created. BasicBlock *PrevBB = CFG.PrevBB; @@ -412,7 +413,8 @@ VPBasicBlock::createEmptyBasicBlock(VPTransformState::CFGState &CFG) { return NewBB; } -void VPBasicBlock::connectToPredecessors(VPTransformState::CFGState &CFG) { +void VPBasicBlock::connectToPredecessors(VPTransformState &State) { + auto &CFG = State.CFG; BasicBlock *NewBB = CFG.VPBB2IRBB[this]; // Hook up the new basic block to its predecessors. for (VPBlockBase *PredVPBlock : getHierarchicalPredecessors()) { @@ -467,7 +469,7 @@ void VPIRBasicBlock::execute(VPTransformState *State) { "other blocks must be terminated by a branch"); } - connectToPredecessors(State->CFG); + connectToPredecessors(*State); } VPIRBasicBlock *VPIRBasicBlock::clone() { @@ -494,7 +496,7 @@ void VPBasicBlock::execute(VPTransformState *State) { // * the exit of a replicate region. 
State->CFG.VPBB2IRBB[this] = NewBB; } else { - NewBB = createEmptyBasicBlock(State->CFG); + NewBB = createEmptyBasicBlock(*State); State->Builder.SetInsertPoint(NewBB); // Temporarily terminate with unreachable until CFG is rewired. @@ -514,7 +516,7 @@ void VPBasicBlock::execute(VPTransformState *State) { State->CFG.PrevBB = NewBB; State->CFG.VPBB2IRBB[this] = NewBB; - connectToPredecessors(State->CFG); + connectToPredecessors(*State); } // 2. Fill the IR basic block with IR instructions. @@ -623,6 +625,11 @@ bool VPBasicBlock::isExiting() const { } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPBlockBase::print(raw_ostream &O) const { + VPSlotTracker SlotTracker(getPlan()); + print(O, "", SlotTracker); +} + void VPBlockBase::printSuccessors(raw_ostream &O, const Twine &Indent) const { if (getSuccessors().empty()) { O << Indent << "No successors\n"; @@ -1471,58 +1478,6 @@ void VPUser::printOperands(raw_ostream &O, VPSlotTracker &SlotTracker) const { } #endif -void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region, - Old2NewTy &Old2New, - InterleavedAccessInfo &IAI) { - ReversePostOrderTraversal> - RPOT(Region->getEntry()); - for (VPBlockBase *Base : RPOT) { - visitBlock(Base, Old2New, IAI); - } -} - -void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, - InterleavedAccessInfo &IAI) { - if (VPBasicBlock *VPBB = dyn_cast(Block)) { - for (VPRecipeBase &VPI : *VPBB) { - if (isa(&VPI)) - continue; - assert(isa(&VPI) && "Can only handle VPInstructions"); - auto *VPInst = cast(&VPI); - - auto *Inst = dyn_cast_or_null(VPInst->getUnderlyingValue()); - if (!Inst) - continue; - auto *IG = IAI.getInterleaveGroup(Inst); - if (!IG) - continue; - - auto NewIGIter = Old2New.find(IG); - if (NewIGIter == Old2New.end()) - Old2New[IG] = new InterleaveGroup( - IG->getFactor(), IG->isReverse(), IG->getAlign()); - - if (Inst == IG->getInsertPos()) - Old2New[IG]->setInsertPos(VPInst); - - InterleaveGroupMap[VPInst] = Old2New[IG]; - 
InterleaveGroupMap[VPInst]->insertMember( - VPInst, IG->getIndex(Inst), - Align(IG->isReverse() ? (-1) * int(IG->getFactor()) - : IG->getFactor())); - } - } else if (VPRegionBlock *Region = dyn_cast(Block)) - visitRegion(Region, Old2New, IAI); - else - llvm_unreachable("Unsupported kind of VPBlock."); -} - -VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan, - InterleavedAccessInfo &IAI) { - Old2NewTy Old2New; - visitRegion(Plan.getVectorLoopRegion(), Old2New, IAI); -} - void VPSlotTracker::assignName(const VPValue *V) { assert(!VPValue2Name.contains(V) && "VPValue already has a name!"); auto *UV = V->getUnderlyingValue(); diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 87ef2dc6d8eae..fac207287e0bc 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -17,7 +17,6 @@ /// 4. VPInstruction, a concrete Recipe and VPUser modeling a single planned /// instruction; /// 5. The VPlan class holding a candidate for vectorization; -/// 6. The VPlanPrinter class providing a way to print a plan in dot format; /// These are documented in docs/VectorizationPlan.rst. 
// //===----------------------------------------------------------------------===// @@ -34,10 +33,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/ADT/ilist.h" #include "llvm/ADT/ilist_node.h" -#include "llvm/Analysis/DomTreeUpdater.h" #include "llvm/Analysis/IVDescriptors.h" -#include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/DebugLoc.h" #include "llvm/IR/FMF.h" @@ -54,7 +50,7 @@ class BasicBlock; class DominatorTree; class InnerLoopVectorizer; class IRBuilderBase; -class LoopInfo; +struct VPTransformState; class raw_ostream; class RecurrenceDescriptor; class SCEV; @@ -63,11 +59,11 @@ class VPBasicBlock; class VPBuilder; class VPRegionBlock; class VPlan; +class VPLane; class VPReplicateRecipe; class VPlanSlp; class Value; class LoopVectorizationCostModel; -class LoopVersioning; struct VPCostContext; @@ -75,318 +71,8 @@ namespace Intrinsic { typedef unsigned ID; } -/// Returns a calculation for the total number of elements for a given \p VF. -/// For fixed width vectors this value is a constant, whereas for scalable -/// vectors it is an expression determined at runtime. -Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF); - -/// Return a value for Step multiplied by VF. -Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, - int64_t Step); - -/// A helper function that returns the reciprocal of the block probability of -/// predicated blocks. If we return X, we are assuming the predicated block -/// will execute once for every X iterations of the loop header. -/// -/// TODO: We should use actual block probability here, if available. Currently, -/// we always assume predicated blocks have a 50% chance of executing. -inline unsigned getReciprocalPredBlockProb() { return 2; } - -/// A range of powers-of-2 vectorization factors with fixed start and -/// adjustable end. 
The range includes start and excludes end, e.g.,: -/// [1, 16) = {1, 2, 4, 8} -struct VFRange { - // A power of 2. - const ElementCount Start; - - // A power of 2. If End <= Start range is empty. - ElementCount End; - - bool isEmpty() const { - return End.getKnownMinValue() <= Start.getKnownMinValue(); - } - - VFRange(const ElementCount &Start, const ElementCount &End) - : Start(Start), End(End) { - assert(Start.isScalable() == End.isScalable() && - "Both Start and End should have the same scalable flag"); - assert(isPowerOf2_32(Start.getKnownMinValue()) && - "Expected Start to be a power of 2"); - assert(isPowerOf2_32(End.getKnownMinValue()) && - "Expected End to be a power of 2"); - } - - /// Iterator to iterate over vectorization factors in a VFRange. - class iterator - : public iterator_facade_base { - ElementCount VF; - - public: - iterator(ElementCount VF) : VF(VF) {} - - bool operator==(const iterator &Other) const { return VF == Other.VF; } - - ElementCount operator*() const { return VF; } - - iterator &operator++() { - VF *= 2; - return *this; - } - }; - - iterator begin() { return iterator(Start); } - iterator end() { - assert(isPowerOf2_32(End.getKnownMinValue())); - return iterator(End); - } -}; - using VPlanPtr = std::unique_ptr; -/// In what follows, the term "input IR" refers to code that is fed into the -/// vectorizer whereas the term "output IR" refers to code that is generated by -/// the vectorizer. - -/// VPLane provides a way to access lanes in both fixed width and scalable -/// vectors, where for the latter the lane index sometimes needs calculating -/// as a runtime expression. -class VPLane { -public: - /// Kind describes how to interpret Lane. - enum class Kind : uint8_t { - /// For First, Lane is the index into the first N elements of a - /// fixed-vector > or a scalable vector >. - First, - /// For ScalableLast, Lane is the offset from the start of the last - /// N-element subvector in a scalable vector >. 
For - /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of - /// 1 corresponds to `((vscale - 1) * N) + 1`, etc. - ScalableLast - }; - -private: - /// in [0..VF) - unsigned Lane; - - /// Indicates how the Lane should be interpreted, as described above. - Kind LaneKind; - -public: - VPLane(unsigned Lane) : Lane(Lane), LaneKind(VPLane::Kind::First) {} - VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {} - - static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); } - - static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset) { - assert(Offset > 0 && Offset <= VF.getKnownMinValue() && - "trying to extract with invalid offset"); - unsigned LaneOffset = VF.getKnownMinValue() - Offset; - Kind LaneKind; - if (VF.isScalable()) - // In this case 'LaneOffset' refers to the offset from the start of the - // last subvector with VF.getKnownMinValue() elements. - LaneKind = VPLane::Kind::ScalableLast; - else - LaneKind = VPLane::Kind::First; - return VPLane(LaneOffset, LaneKind); - } - - static VPLane getLastLaneForVF(const ElementCount &VF) { - return getLaneFromEnd(VF, 1); - } - - /// Returns a compile-time known value for the lane index and asserts if the - /// lane can only be calculated at runtime. - unsigned getKnownLane() const { - assert(LaneKind == Kind::First); - return Lane; - } - - /// Returns an expression describing the lane index that can be used at - /// runtime. - Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const; - - /// Returns the Kind of lane offset. - Kind getKind() const { return LaneKind; } - - /// Returns true if this is the first lane of the whole vector. - bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; } - - /// Maps the lane to a cache index based on \p VF. 
- unsigned mapToCacheIndex(const ElementCount &VF) const { - switch (LaneKind) { - case VPLane::Kind::ScalableLast: - assert(VF.isScalable() && Lane < VF.getKnownMinValue()); - return VF.getKnownMinValue() + Lane; - default: - assert(Lane < VF.getKnownMinValue()); - return Lane; - } - } -}; - -/// VPTransformState holds information passed down when "executing" a VPlan, -/// needed for generating the output IR. -struct VPTransformState { - VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, unsigned UF, - LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, - InnerLoopVectorizer *ILV, VPlan *Plan, - Loop *CurrentParentLoop, Type *CanonicalIVTy); - /// Target Transform Info. - const TargetTransformInfo *TTI; - - /// The chosen Vectorization Factor of the loop being vectorized. - ElementCount VF; - - /// Hold the index to generate specific scalar instructions. Null indicates - /// that all instances are to be generated, using either scalar or vector - /// instructions. - std::optional Lane; - - struct DataState { - // Each value from the original loop, when vectorized, is represented by a - // vector value in the map. - DenseMap VPV2Vector; - - DenseMap> VPV2Scalars; - } Data; - - /// Get the generated vector Value for a given VPValue \p Def if \p IsScalar - /// is false, otherwise return the generated scalar. \See set. - Value *get(VPValue *Def, bool IsScalar = false); - - /// Get the generated Value for a given VPValue and given Part and Lane. - Value *get(VPValue *Def, const VPLane &Lane); - - bool hasVectorValue(VPValue *Def) { return Data.VPV2Vector.contains(Def); } - - bool hasScalarValue(VPValue *Def, VPLane Lane) { - auto I = Data.VPV2Scalars.find(Def); - if (I == Data.VPV2Scalars.end()) - return false; - unsigned CacheIdx = Lane.mapToCacheIndex(VF); - return CacheIdx < I->second.size() && I->second[CacheIdx]; - } - - /// Set the generated vector Value for a given VPValue, if \p - /// IsScalar is false. 
If \p IsScalar is true, set the scalar in lane 0. - void set(VPValue *Def, Value *V, bool IsScalar = false) { - if (IsScalar) { - set(Def, V, VPLane(0)); - return; - } - assert((VF.isScalar() || V->getType()->isVectorTy()) && - "scalar values must be stored as (0, 0)"); - Data.VPV2Vector[Def] = V; - } - - /// Reset an existing vector value for \p Def and a given \p Part. - void reset(VPValue *Def, Value *V) { - assert(Data.VPV2Vector.contains(Def) && "need to overwrite existing value"); - Data.VPV2Vector[Def] = V; - } - - /// Set the generated scalar \p V for \p Def and the given \p Lane. - void set(VPValue *Def, Value *V, const VPLane &Lane) { - auto &Scalars = Data.VPV2Scalars[Def]; - unsigned CacheIdx = Lane.mapToCacheIndex(VF); - if (Scalars.size() <= CacheIdx) - Scalars.resize(CacheIdx + 1); - assert(!Scalars[CacheIdx] && "should overwrite existing value"); - Scalars[CacheIdx] = V; - } - - /// Reset an existing scalar value for \p Def and a given \p Lane. - void reset(VPValue *Def, Value *V, const VPLane &Lane) { - auto Iter = Data.VPV2Scalars.find(Def); - assert(Iter != Data.VPV2Scalars.end() && - "need to overwrite existing value"); - unsigned CacheIdx = Lane.mapToCacheIndex(VF); - assert(CacheIdx < Iter->second.size() && - "need to overwrite existing value"); - Iter->second[CacheIdx] = V; - } - - /// Add additional metadata to \p To that was not present on \p Orig. - /// - /// Currently this is used to add the noalias annotations based on the - /// inserted memchecks. Use this for instructions that are *cloned* into the - /// vector loop. - void addNewMetadata(Instruction *To, const Instruction *Orig); - - /// Add metadata from one instruction to another. - /// - /// This includes both the original MDs from \p From and additional ones (\see - /// addNewMetadata). Use this for *newly created* instructions in the vector - /// loop. - void addMetadata(Value *To, Instruction *From); - - /// Set the debug location in the builder using the debug location \p DL. 
- void setDebugLocFrom(DebugLoc DL); - - /// Construct the vector value of a scalarized value \p V one lane at a time. - void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane); - - /// Hold state information used when constructing the CFG of the output IR, - /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. - struct CFGState { - /// The previous VPBasicBlock visited. Initially set to null. - VPBasicBlock *PrevVPBB = nullptr; - - /// The previous IR BasicBlock created or used. Initially set to the new - /// header BasicBlock. - BasicBlock *PrevBB = nullptr; - - /// The last IR BasicBlock in the output IR. Set to the exit block of the - /// vector loop. - BasicBlock *ExitBB = nullptr; - - /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case - /// of replication, maps the BasicBlock of the last replica created. - SmallDenseMap VPBB2IRBB; - - /// Updater for the DominatorTree. - DomTreeUpdater DTU; - - CFGState(DominatorTree *DT) - : DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy) {} - - /// Returns the BasicBlock* mapped to the pre-header of the loop region - /// containing \p R. - BasicBlock *getPreheaderBBFor(VPRecipeBase *R); - } CFG; - - /// Hold a pointer to LoopInfo to register new basic blocks in the loop. - LoopInfo *LI; - - /// Hold a reference to the IRBuilder used to generate output IR code. - IRBuilderBase &Builder; - - /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods. - InnerLoopVectorizer *ILV; - - /// Pointer to the VPlan code is generated for. - VPlan *Plan; - - /// The parent loop object for the current scope, or nullptr. - Loop *CurrentParentLoop = nullptr; - - /// LoopVersioning. It's only set up (non-null) if memchecks were - /// used. - /// - /// This is currently only used to add no-alias metadata based on the - /// memchecks. The actually versioning is performed manually. - LoopVersioning *LVer = nullptr; - - /// Map SCEVs to their expanded values. 
Populated when executing - /// VPExpandSCEVRecipes. - DenseMap ExpandedSCEVs; - - /// VPlan-based type analysis. - VPTypeAnalysis TypeAnalysis; -}; - /// VPBlockBase is the building block of the Hierarchical Control-Flow Graph. /// A VPBlockBase can be either a VPBasicBlock or a VPRegionBlock. class VPBlockBase { @@ -654,10 +340,7 @@ class VPBlockBase { VPSlotTracker &SlotTracker) const = 0; /// Print plain-text dump of this VPlan to \p O. - void print(raw_ostream &O) const { - VPSlotTracker SlotTracker(getPlan()); - print(O, "", SlotTracker); - } + void print(raw_ostream &O) const; /// Print the successors of this block to \p O, prefixing all lines with \p /// Indent. @@ -673,34 +356,6 @@ class VPBlockBase { virtual VPBlockBase *clone() = 0; }; -/// Struct to hold various analysis needed for cost computations. -struct VPCostContext { - const TargetTransformInfo &TTI; - const TargetLibraryInfo &TLI; - VPTypeAnalysis Types; - LLVMContext &LLVMCtx; - LoopVectorizationCostModel &CM; - SmallPtrSet SkipCostComputation; - TargetTransformInfo::TargetCostKind CostKind; - - VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, - Type *CanIVTy, LoopVectorizationCostModel &CM, - TargetTransformInfo::TargetCostKind CostKind) - : TTI(TTI), TLI(TLI), Types(CanIVTy), LLVMCtx(CanIVTy->getContext()), - CM(CM), CostKind(CostKind) {} - - /// Return the cost for \p UI with \p VF using the legacy cost model as - /// fallback until computing the cost of all recipes migrates to VPlan. - InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const; - - /// Return true if the cost for \p UI shouldn't be computed, e.g. because it - /// has already been pre-computed. - bool skipCostComputation(Instruction *UI, bool IsVector) const; - - /// Returns the OperandInfo for \p V, if it is a live-in. 
- TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const; -}; - /// VPRecipeBase is a base class modeling a sequence of one or more output IR /// instructions. VPRecipeBase owns the VPValues it defines through VPDef /// and is responsible for deleting its defined values. Single-value @@ -3671,12 +3326,12 @@ class VPBasicBlock : public VPBlockBase { /// Connect the VPBBs predecessors' in the VPlan CFG to the IR basic block /// generated for this VPBB. - void connectToPredecessors(VPTransformState::CFGState &CFG); + void connectToPredecessors(VPTransformState &State); private: /// Create an IR BasicBlock to hold the output instructions generated by this /// VPBasicBlock, and return it. Update the CFGState accordingly. - BasicBlock *createEmptyBasicBlock(VPTransformState::CFGState &CFG); + BasicBlock *createEmptyBasicBlock(VPTransformState &State); }; /// A special type of VPBasicBlock that wraps an existing IR basic block. @@ -4146,55 +3801,6 @@ class VPlan { }; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) -/// VPlanPrinter prints a given VPlan to a given output stream. The printing is -/// indented and follows the dot format. -class VPlanPrinter { - raw_ostream &OS; - const VPlan &Plan; - unsigned Depth = 0; - unsigned TabWidth = 2; - std::string Indent; - unsigned BID = 0; - SmallDenseMap BlockID; - - VPSlotTracker SlotTracker; - - /// Handle indentation. - void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); } - - /// Print a given \p Block of the Plan. - void dumpBlock(const VPBlockBase *Block); - - /// Print the information related to the CFG edges going out of a given - /// \p Block, followed by printing the successor blocks themselves. - void dumpEdges(const VPBlockBase *Block); - - /// Print a given \p BasicBlock, including its VPRecipes, followed by printing - /// its successor blocks. - void dumpBasicBlock(const VPBasicBlock *BasicBlock); - - /// Print a given \p Region of the Plan. 
- void dumpRegion(const VPRegionBlock *Region); - - unsigned getOrCreateBID(const VPBlockBase *Block) { - return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++; - } - - Twine getOrCreateName(const VPBlockBase *Block); - - Twine getUID(const VPBlockBase *Block); - - /// Print the information related to a CFG edge between two VPBlockBases. - void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden, - const Twine &Label); - -public: - VPlanPrinter(raw_ostream &O, const VPlan &P) - : OS(O), Plan(P), SlotTracker(&P) {} - - LLVM_DUMP_METHOD void dump(); -}; - struct VPlanIngredient { const Value *V; @@ -4214,139 +3820,6 @@ inline raw_ostream &operator<<(raw_ostream &OS, const VPlan &Plan) { } #endif -class VPInterleavedAccessInfo { - DenseMap *> - InterleaveGroupMap; - - /// Type for mapping of instruction based interleave groups to VPInstruction - /// interleave groups - using Old2NewTy = DenseMap *, - InterleaveGroup *>; - - /// Recursively \p Region and populate VPlan based interleave groups based on - /// \p IAI. - void visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New, - InterleavedAccessInfo &IAI); - /// Recursively traverse \p Block and populate VPlan based interleave groups - /// based on \p IAI. - void visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, - InterleavedAccessInfo &IAI); - -public: - VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI); - - ~VPInterleavedAccessInfo() { - SmallPtrSet *, 4> DelSet; - // Avoid releasing a pointer twice. - for (auto &I : InterleaveGroupMap) - DelSet.insert(I.second); - for (auto *Ptr : DelSet) - delete Ptr; - } - - /// Get the interleave group that \p Instr belongs to. - /// - /// \returns nullptr if doesn't have such group. - InterleaveGroup * - getInterleaveGroup(VPInstruction *Instr) const { - return InterleaveGroupMap.lookup(Instr); - } -}; - -/// Class that maps (parts of) an existing VPlan to trees of combined -/// VPInstructions. 
-class VPlanSlp { - enum class OpMode { Failed, Load, Opcode }; - - /// A DenseMapInfo implementation for using SmallVector as - /// DenseMap keys. - struct BundleDenseMapInfo { - static SmallVector getEmptyKey() { - return {reinterpret_cast(-1)}; - } - - static SmallVector getTombstoneKey() { - return {reinterpret_cast(-2)}; - } - - static unsigned getHashValue(const SmallVector &V) { - return static_cast(hash_combine_range(V.begin(), V.end())); - } - - static bool isEqual(const SmallVector &LHS, - const SmallVector &RHS) { - return LHS == RHS; - } - }; - - /// Mapping of values in the original VPlan to a combined VPInstruction. - DenseMap, VPInstruction *, BundleDenseMapInfo> - BundleToCombined; - - VPInterleavedAccessInfo &IAI; - - /// Basic block to operate on. For now, only instructions in a single BB are - /// considered. - const VPBasicBlock &BB; - - /// Indicates whether we managed to combine all visited instructions or not. - bool CompletelySLP = true; - - /// Width of the widest combined bundle in bits. - unsigned WidestBundleBits = 0; - - using MultiNodeOpTy = - typename std::pair>; - - // Input operand bundles for the current multi node. Each multi node operand - // bundle contains values not matching the multi node's opcode. They will - // be reordered in reorderMultiNodeOps, once we completed building a - // multi node. - SmallVector MultiNodeOps; - - /// Indicates whether we are building a multi node currently. - bool MultiNodeActive = false; - - /// Check if we can vectorize Operands together. - bool areVectorizable(ArrayRef Operands) const; - - /// Add combined instruction \p New for the bundle \p Operands. - void addCombined(ArrayRef Operands, VPInstruction *New); - - /// Indicate we hit a bundle we failed to combine. Returns nullptr for now. - VPInstruction *markFailed(); - - /// Reorder operands in the multi node to maximize sequential memory access - /// and commutative operations. 
- SmallVector reorderMultiNodeOps(); - - /// Choose the best candidate to use for the lane after \p Last. The set of - /// candidates to choose from are values with an opcode matching \p Last's - /// or loads consecutive to \p Last. - std::pair getBest(OpMode Mode, VPValue *Last, - SmallPtrSetImpl &Candidates, - VPInterleavedAccessInfo &IAI); - -#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) - /// Print bundle \p Values to dbgs(). - void dumpBundle(ArrayRef Values); -#endif - -public: - VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {} - - ~VPlanSlp() = default; - - /// Tries to build an SLP tree rooted at \p Operands and returns a - /// VPInstruction combining \p Operands, if they can be combined. - VPInstruction *buildGraph(ArrayRef Operands); - - /// Return the width of the widest combined bundle in bits. - unsigned getWidestBundleBits() const { return WidestBundleBits; } - - /// Return true if all visited instruction can be combined. - bool isCompletelySLP() const { return CompletelySLP; } -}; } // end namespace llvm #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H diff --git a/llvm/lib/Transforms/Vectorize/VPlanHelpers.h b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h new file mode 100644 index 0000000000000..74713daf904f0 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/VPlanHelpers.h @@ -0,0 +1,468 @@ +//===- VPlanHelpers.h - VPlan-related auxiliary helpers -------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file contains the declarations of different VPlan-related auxiliary +/// helpers. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANHELPERS_H +#define LLVM_TRANSFORMS_VECTORIZE_VPLANHELPERS_H + +#include "VPlanAnalysis.h" +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/DomTreeUpdater.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Support/InstructionCost.h" + +namespace llvm { + +class BasicBlock; +class DominatorTree; +class InnerLoopVectorizer; +class IRBuilderBase; +class LoopInfo; +class SCEV; +class Type; +class VPBasicBlock; +class VPRegionBlock; +class VPlan; +class Value; +class LoopVersioning; + +/// Returns a calculation for the total number of elements for a given \p VF. +/// For fixed width vectors this value is a constant, whereas for scalable +/// vectors it is an expression determined at runtime. +Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF); + +/// Return a value for Step multiplied by VF. +Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF, + int64_t Step); + +/// A helper function that returns the reciprocal of the block probability of +/// predicated blocks. If we return X, we are assuming the predicated block +/// will execute once for every X iterations of the loop header. +/// +/// TODO: We should use actual block probability here, if available. Currently, +/// we always assume predicated blocks have a 50% chance of executing. +inline unsigned getReciprocalPredBlockProb() { return 2; } + +/// A range of powers-of-2 vectorization factors with fixed start and +/// adjustable end. The range includes start and excludes end, e.g.,: +/// [1, 16) = {1, 2, 4, 8} +struct VFRange { + // A power of 2. + const ElementCount Start; + + // A power of 2. If End <= Start range is empty. 
+ ElementCount End; + + bool isEmpty() const { + return End.getKnownMinValue() <= Start.getKnownMinValue(); + } + + VFRange(const ElementCount &Start, const ElementCount &End) + : Start(Start), End(End) { + assert(Start.isScalable() == End.isScalable() && + "Both Start and End should have the same scalable flag"); + assert(isPowerOf2_32(Start.getKnownMinValue()) && + "Expected Start to be a power of 2"); + assert(isPowerOf2_32(End.getKnownMinValue()) && + "Expected End to be a power of 2"); + } + + /// Iterator to iterate over vectorization factors in a VFRange. + class iterator + : public iterator_facade_base { + ElementCount VF; + + public: + iterator(ElementCount VF) : VF(VF) {} + + bool operator==(const iterator &Other) const { return VF == Other.VF; } + + ElementCount operator*() const { return VF; } + + iterator &operator++() { + VF *= 2; + return *this; + } + }; + + iterator begin() { return iterator(Start); } + iterator end() { + assert(isPowerOf2_32(End.getKnownMinValue())); + return iterator(End); + } +}; + +/// In what follows, the term "input IR" refers to code that is fed into the +/// vectorizer whereas the term "output IR" refers to code that is generated by +/// the vectorizer. + +/// VPLane provides a way to access lanes in both fixed width and scalable +/// vectors, where for the latter the lane index sometimes needs calculating +/// as a runtime expression. +class VPLane { +public: + /// Kind describes how to interpret Lane. + enum class Kind : uint8_t { + /// For First, Lane is the index into the first N elements of a + /// fixed-vector <N x <ElTy>> or a scalable vector <vscale x N x <ElTy>>. + First, + /// For ScalableLast, Lane is the offset from the start of the last + /// N-element subvector in a scalable vector <vscale x N x <ElTy>>. For + /// example, a Lane of 0 corresponds to lane `(vscale - 1) * N`, a Lane of + /// 1 corresponds to `((vscale - 1) * N) + 1`, etc.
+ ScalableLast + }; + +private: + /// in [0..VF) + unsigned Lane; + + /// Indicates how the Lane should be interpreted, as described above. + Kind LaneKind = Kind::First; + +public: + VPLane(unsigned Lane) : Lane(Lane) {} + VPLane(unsigned Lane, Kind LaneKind) : Lane(Lane), LaneKind(LaneKind) {} + + static VPLane getFirstLane() { return VPLane(0, VPLane::Kind::First); } + + static VPLane getLaneFromEnd(const ElementCount &VF, unsigned Offset) { + assert(Offset > 0 && Offset <= VF.getKnownMinValue() && + "trying to extract with invalid offset"); + unsigned LaneOffset = VF.getKnownMinValue() - Offset; + Kind LaneKind; + if (VF.isScalable()) + // In this case 'LaneOffset' refers to the offset from the start of the + // last subvector with VF.getKnownMinValue() elements. + LaneKind = VPLane::Kind::ScalableLast; + else + LaneKind = VPLane::Kind::First; + return VPLane(LaneOffset, LaneKind); + } + + static VPLane getLastLaneForVF(const ElementCount &VF) { + return getLaneFromEnd(VF, 1); + } + + /// Returns a compile-time known value for the lane index and asserts if the + /// lane can only be calculated at runtime. + unsigned getKnownLane() const { + assert(LaneKind == Kind::First && + "can only get known lane from the beginning"); + return Lane; + } + + /// Returns an expression describing the lane index that can be used at + /// runtime. + Value *getAsRuntimeExpr(IRBuilderBase &Builder, const ElementCount &VF) const; + + /// Returns the Kind of lane offset. + Kind getKind() const { return LaneKind; } + + /// Returns true if this is the first lane of the whole vector. + bool isFirstLane() const { return Lane == 0 && LaneKind == Kind::First; } + + /// Maps the lane to a cache index based on \p VF. 
+ unsigned mapToCacheIndex(const ElementCount &VF) const { + switch (LaneKind) { + case VPLane::Kind::ScalableLast: + assert(VF.isScalable() && Lane < VF.getKnownMinValue() && + "ScalableLast can only be used with scalable VFs"); + return VF.getKnownMinValue() + Lane; + default: + assert(Lane < VF.getKnownMinValue() && + "Cannot extract lane larger than VF"); + return Lane; + } + } +}; + +/// VPTransformState holds information passed down when "executing" a VPlan, +/// needed for generating the output IR. +struct VPTransformState { + VPTransformState(const TargetTransformInfo *TTI, ElementCount VF, unsigned UF, + LoopInfo *LI, DominatorTree *DT, IRBuilderBase &Builder, + InnerLoopVectorizer *ILV, VPlan *Plan, + Loop *CurrentParentLoop, Type *CanonicalIVTy); + /// Target Transform Info. + const TargetTransformInfo *TTI; + + /// The chosen Vectorization Factor of the loop being vectorized. + ElementCount VF; + + /// Hold the index to generate specific scalar instructions. Null indicates + /// that all instances are to be generated, using either scalar or vector + /// instructions. + std::optional Lane; + + struct DataState { + // Each value from the original loop, when vectorized, is represented by a + // vector value in the map. + DenseMap VPV2Vector; + + DenseMap> VPV2Scalars; + } Data; + + /// Get the generated vector Value for a given VPValue \p Def if \p IsScalar + /// is false, otherwise return the generated scalar. \see set. + Value *get(VPValue *Def, bool IsScalar = false); + + /// Get the generated Value for a given VPValue \p Def and the given \p Lane. 
+ Value *get(VPValue *Def, const VPLane &Lane); + + bool hasVectorValue(VPValue *Def) { return Data.VPV2Vector.contains(Def); } + + bool hasScalarValue(VPValue *Def, VPLane Lane) { + auto I = Data.VPV2Scalars.find(Def); + if (I == Data.VPV2Scalars.end()) + return false; + unsigned CacheIdx = Lane.mapToCacheIndex(VF); + return CacheIdx < I->second.size() && I->second[CacheIdx]; + } + + /// Set the generated vector Value for a given VPValue, if \p + /// IsScalar is false. If \p IsScalar is true, set the scalar in lane 0. + void set(VPValue *Def, Value *V, bool IsScalar = false) { + if (IsScalar) { + set(Def, V, VPLane(0)); + return; + } + assert((VF.isScalar() || V->getType()->isVectorTy()) && + "scalar values must be stored as (0, 0)"); + Data.VPV2Vector[Def] = V; + } + + /// Reset an existing vector value for \p Def to \p V. + void reset(VPValue *Def, Value *V) { + assert(Data.VPV2Vector.contains(Def) && "need to overwrite existing value"); + Data.VPV2Vector[Def] = V; + } + + /// Set the generated scalar \p V for \p Def and the given \p Lane. + void set(VPValue *Def, Value *V, const VPLane &Lane) { + auto &Scalars = Data.VPV2Scalars[Def]; + unsigned CacheIdx = Lane.mapToCacheIndex(VF); + if (Scalars.size() <= CacheIdx) + Scalars.resize(CacheIdx + 1); + assert(!Scalars[CacheIdx] && "should overwrite existing value"); + Scalars[CacheIdx] = V; + } + + /// Reset an existing scalar value for \p Def and a given \p Lane. + void reset(VPValue *Def, Value *V, const VPLane &Lane) { + auto Iter = Data.VPV2Scalars.find(Def); + assert(Iter != Data.VPV2Scalars.end() && + "need to overwrite existing value"); + unsigned CacheIdx = Lane.mapToCacheIndex(VF); + assert(CacheIdx < Iter->second.size() && + "need to overwrite existing value"); + Iter->second[CacheIdx] = V; + } + + /// Add additional metadata to \p To that was not present on \p Orig. + /// + /// Currently this is used to add the noalias annotations based on the + /// inserted memchecks. 
Use this for instructions that are *cloned* into the + /// vector loop. + void addNewMetadata(Instruction *To, const Instruction *Orig); + + /// Add metadata from one instruction to another. + /// + /// This includes both the original MDs from \p From and additional ones (\see + /// addNewMetadata). Use this for *newly created* instructions in the vector + /// loop. + void addMetadata(Value *To, Instruction *From); + + /// Set the debug location in the builder using the debug location \p DL. + void setDebugLocFrom(DebugLoc DL); + + /// Construct the vector value of a scalarized value \p V one lane at a time. + void packScalarIntoVectorValue(VPValue *Def, const VPLane &Lane); + + /// Hold state information used when constructing the CFG of the output IR, + /// traversing the VPBasicBlocks and generating corresponding IR BasicBlocks. + struct CFGState { + /// The previous VPBasicBlock visited. Initially set to null. + VPBasicBlock *PrevVPBB = nullptr; + + /// The previous IR BasicBlock created or used. Initially set to the new + /// header BasicBlock. + BasicBlock *PrevBB = nullptr; + + /// The last IR BasicBlock in the output IR. Set to the exit block of the + /// vector loop. + BasicBlock *ExitBB = nullptr; + + /// A mapping of each VPBasicBlock to the corresponding BasicBlock. In case + /// of replication, maps the BasicBlock of the last replica created. + SmallDenseMap VPBB2IRBB; + + /// Updater for the DominatorTree. + DomTreeUpdater DTU; + + CFGState(DominatorTree *DT) + : DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy) {} + + /// Returns the BasicBlock* mapped to the pre-header of the loop region + /// containing \p R. + BasicBlock *getPreheaderBBFor(VPRecipeBase *R); + } CFG; + + /// Hold a pointer to LoopInfo to register new basic blocks in the loop. + LoopInfo *LI; + + /// Hold a reference to the IRBuilder used to generate output IR code. + IRBuilderBase &Builder; + + /// Hold a pointer to InnerLoopVectorizer to reuse its IR generation methods. 
+ InnerLoopVectorizer *ILV; + + /// Pointer to the VPlan code is generated for. + VPlan *Plan; + + /// The parent loop object for the current scope, or nullptr. + Loop *CurrentParentLoop = nullptr; + + /// LoopVersioning. It's only set up (non-null) if memchecks were + /// used. + /// + /// This is currently only used to add no-alias metadata based on the + /// memchecks. The actual versioning is performed manually. + LoopVersioning *LVer = nullptr; + + /// Map SCEVs to their expanded values. Populated when executing + /// VPExpandSCEVRecipes. + DenseMap ExpandedSCEVs; + + /// VPlan-based type analysis. + VPTypeAnalysis TypeAnalysis; +}; + +/// Struct to hold various analyses needed for cost computations. +struct VPCostContext { + const TargetTransformInfo &TTI; + const TargetLibraryInfo &TLI; + VPTypeAnalysis Types; + LLVMContext &LLVMCtx; + LoopVectorizationCostModel &CM; + SmallPtrSet SkipCostComputation; + TargetTransformInfo::TargetCostKind CostKind; + + VPCostContext(const TargetTransformInfo &TTI, const TargetLibraryInfo &TLI, + Type *CanIVTy, LoopVectorizationCostModel &CM, + TargetTransformInfo::TargetCostKind CostKind) + : TTI(TTI), TLI(TLI), Types(CanIVTy), LLVMCtx(CanIVTy->getContext()), + CM(CM), CostKind(CostKind) {} + + /// Return the cost for \p UI with \p VF using the legacy cost model as + /// fallback until computing the cost of all recipes migrates to VPlan. + InstructionCost getLegacyCost(Instruction *UI, ElementCount VF) const; + + /// Return true if the cost for \p UI shouldn't be computed, e.g. because it + /// has already been pre-computed. + bool skipCostComputation(Instruction *UI, bool IsVector) const; + + /// Returns the OperandInfo for \p V, if it is a live-in. + TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const; +}; + +/// This class can be used to assign names to VPValues. For VPValues without +/// underlying value, assign consecutive numbers and use those as names (wrapped +/// in vp<>). 
Otherwise, use the name from the underlying value (wrapped in +/// ir<>), appending a .V version number if there are multiple uses of the same +/// name. Allows querying names for VPValues for printing, similar to the +/// ModuleSlotTracker for IR values. +class VPSlotTracker { + /// Keep track of versioned names assigned to VPValues with underlying IR + /// values. + DenseMap VPValue2Name; + /// Keep track of the next number to use to version the base name. + StringMap BaseName2Version; + + /// Number to assign to the next VPValue without underlying value. + unsigned NextSlot = 0; + + void assignName(const VPValue *V); + void assignNames(const VPlan &Plan); + void assignNames(const VPBasicBlock *VPBB); + +public: + VPSlotTracker(const VPlan *Plan = nullptr) { + if (Plan) + assignNames(*Plan); + } + + /// Returns the name assigned to \p V, if there is one, otherwise try to + /// construct one from the underlying value, if there's one; else return + /// <badref>. + std::string getOrCreateName(const VPValue *V) const; +}; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +/// VPlanPrinter prints a given VPlan to a given output stream. The printing is +/// indented and follows the dot format. +class VPlanPrinter { + raw_ostream &OS; + const VPlan &Plan; + unsigned Depth = 0; + unsigned TabWidth = 2; + std::string Indent; + unsigned BID = 0; + SmallDenseMap BlockID; + + VPSlotTracker SlotTracker; + + /// Handle indentation. + void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); } + + /// Print a given \p Block of the Plan. + void dumpBlock(const VPBlockBase *Block); + + /// Print the information related to the CFG edges going out of a given + /// \p Block, followed by printing the successor blocks themselves. + void dumpEdges(const VPBlockBase *Block); + + /// Print a given \p BasicBlock, including its VPRecipes, followed by printing + /// its successor blocks. 
+ void dumpBasicBlock(const VPBasicBlock *BasicBlock); + + /// Print a given \p Region of the Plan. + void dumpRegion(const VPRegionBlock *Region); + + unsigned getOrCreateBID(const VPBlockBase *Block) { + return BlockID.count(Block) ? BlockID[Block] : BlockID[Block] = BID++; + } + + Twine getOrCreateName(const VPBlockBase *Block); + + Twine getUID(const VPBlockBase *Block); + + /// Print the information related to a CFG edge between two VPBlockBases. + void drawEdge(const VPBlockBase *From, const VPBlockBase *To, bool Hidden, + const Twine &Label); + +public: + VPlanPrinter(raw_ostream &O, const VPlan &P) + : OS(O), Plan(P), SlotTracker(&P) {} + + LLVM_DUMP_METHOD void dump(); +}; +#endif + +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_H diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 81031b9401ca0..c84a93d7398f7 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -14,12 +14,14 @@ #include "LoopVectorizationPlanner.h" #include "VPlan.h" #include "VPlanAnalysis.h" +#include "VPlanHelpers.h" #include "VPlanPatternMatch.h" #include "VPlanUtils.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Twine.h" #include "llvm/Analysis/IVDescriptors.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp index 98ccf21694635..e943c7a29eb83 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.cpp @@ -14,10 +14,13 @@ /// //===----------------------------------------------------------------------===// +#include "VPlanSLP.h" #include "VPlan.h" +#include "VPlanCFG.h" #include "VPlanValue.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" +#include 
"llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -39,6 +42,57 @@ using namespace llvm; // Number of levels to look ahead when re-ordering multi node operands. static unsigned LookaheadMaxDepth = 5; +void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region, + Old2NewTy &Old2New, + InterleavedAccessInfo &IAI) { + ReversePostOrderTraversal> RPOT( + Region->getEntry()); + for (VPBlockBase *Base : RPOT) { + visitBlock(Base, Old2New, IAI); + } +} + +void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, + InterleavedAccessInfo &IAI) { + if (VPBasicBlock *VPBB = dyn_cast(Block)) { + for (VPRecipeBase &VPI : *VPBB) { + if (isa(&VPI)) + continue; + auto *VPInst = cast(&VPI); + auto *Inst = dyn_cast_or_null(VPInst->getUnderlyingValue()); + if (!Inst) + continue; + auto *IG = IAI.getInterleaveGroup(Inst); + if (!IG) + continue; + + auto NewIGIter = Old2New.find(IG); + if (NewIGIter == Old2New.end()) + Old2New[IG] = new InterleaveGroup( + IG->getFactor(), IG->isReverse(), IG->getAlign()); + + if (Inst == IG->getInsertPos()) + Old2New[IG]->setInsertPos(VPInst); + + InterleaveGroupMap[VPInst] = Old2New[IG]; + InterleaveGroupMap[VPInst]->insertMember( + VPInst, IG->getIndex(Inst), + Align(IG->isReverse() ? (-1) * int(IG->getFactor()) + : IG->getFactor())); + } + } else if (VPRegionBlock *Region = dyn_cast(Block)) { + visitRegion(Region, Old2New, IAI); + } else { + llvm_unreachable("Unsupported kind of VPBlock."); + } +} + +VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan, + InterleavedAccessInfo &IAI) { + Old2NewTy Old2New; + visitRegion(Plan.getVectorLoopRegion(), Old2New, IAI); +} + VPInstruction *VPlanSlp::markFailed() { // FIXME: Currently this is used to signal we hit instructions we cannot // trivially SLP'ize. 
diff --git a/llvm/lib/Transforms/Vectorize/VPlanSLP.h b/llvm/lib/Transforms/Vectorize/VPlanSLP.h new file mode 100644 index 0000000000000..a40ebd28deea2 --- /dev/null +++ b/llvm/lib/Transforms/Vectorize/VPlanSLP.h @@ -0,0 +1,166 @@ +//===- VPlanSLP.h - VPlan-based SLP ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// This file contains the declarations for VPlan-based SLP. +/// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANSLP_H +#define LLVM_TRANSFORMS_VECTORIZE_VPLANSLP_H + +#include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/VectorUtils.h" + +namespace llvm { + +class VPBasicBlock; +class VPBlockBase; +class VPRegionBlock; +class VPlan; +class VPValue; +class VPInstruction; + +class VPInterleavedAccessInfo { + DenseMap *> + InterleaveGroupMap; + + /// Type for mapping of instruction based interleave groups to VPInstruction + /// interleave groups + using Old2NewTy = DenseMap *, + InterleaveGroup *>; + + /// Recursively traverse \p Region and populate VPlan based interleave groups + /// based on \p IAI. + void visitRegion(VPRegionBlock *Region, Old2NewTy &Old2New, + InterleavedAccessInfo &IAI); + /// Recursively traverse \p Block and populate VPlan based interleave groups + /// based on \p IAI. + void visitBlock(VPBlockBase *Block, Old2NewTy &Old2New, + InterleavedAccessInfo &IAI); + +public: + VPInterleavedAccessInfo(VPlan &Plan, InterleavedAccessInfo &IAI); + + ~VPInterleavedAccessInfo() { + SmallPtrSet *, 4> DelSet; + // Avoid releasing a pointer twice. 
+ for (auto &I : InterleaveGroupMap) + DelSet.insert(I.second); + for (auto *Ptr : DelSet) + delete Ptr; + } + + /// Get the interleave group that \p Instr belongs to. + /// + /// \returns nullptr if \p Instr doesn't belong to any group. + InterleaveGroup * + getInterleaveGroup(VPInstruction *Instr) const { + return InterleaveGroupMap.lookup(Instr); + } +}; + +/// Class that maps (parts of) an existing VPlan to trees of combined +/// VPInstructions. +class VPlanSlp { + enum class OpMode { Failed, Load, Opcode }; + + /// A DenseMapInfo implementation for using SmallVector as + /// DenseMap keys. + struct BundleDenseMapInfo { + static SmallVector getEmptyKey() { + return {reinterpret_cast(-1)}; + } + + static SmallVector getTombstoneKey() { + return {reinterpret_cast(-2)}; + } + + static unsigned getHashValue(const SmallVector &V) { + return static_cast(hash_combine_range(V.begin(), V.end())); + } + + static bool isEqual(const SmallVector &LHS, + const SmallVector &RHS) { + return LHS == RHS; + } + }; + + /// Mapping of values in the original VPlan to a combined VPInstruction. + DenseMap, VPInstruction *, BundleDenseMapInfo> + BundleToCombined; + + VPInterleavedAccessInfo &IAI; + + /// Basic block to operate on. For now, only instructions in a single BB are + /// considered. + const VPBasicBlock &BB; + + /// Indicates whether we managed to combine all visited instructions or not. + bool CompletelySLP = true; + + /// Width of the widest combined bundle in bits. + unsigned WidestBundleBits = 0; + + using MultiNodeOpTy = + typename std::pair>; + + // Input operand bundles for the current multi node. Each multi node operand + // bundle contains values not matching the multi node's opcode. They will + // be reordered in reorderMultiNodeOps, once we completed building a + // multi node. + SmallVector MultiNodeOps; + + /// Indicates whether we are building a multi node currently. + bool MultiNodeActive = false; + + /// Check if we can vectorize Operands together. 
+ bool areVectorizable(ArrayRef Operands) const; + + /// Add combined instruction \p New for the bundle \p Operands. + void addCombined(ArrayRef Operands, VPInstruction *New); + + /// Indicate we hit a bundle we failed to combine. Returns nullptr for now. + VPInstruction *markFailed(); + + /// Reorder operands in the multi node to maximize sequential memory access + /// and commutative operations. + SmallVector reorderMultiNodeOps(); + + /// Choose the best candidate to use for the lane after \p Last. The set of + /// candidates to choose from are values with an opcode matching \p Last's + /// or loads consecutive to \p Last. + std::pair getBest(OpMode Mode, VPValue *Last, + SmallPtrSetImpl &Candidates, + VPInterleavedAccessInfo &IAI); + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print bundle \p Values to dbgs(). + void dumpBundle(ArrayRef Values); +#endif + +public: + VPlanSlp(VPInterleavedAccessInfo &IAI, VPBasicBlock &BB) : IAI(IAI), BB(BB) {} + + ~VPlanSlp() = default; + + /// Tries to build an SLP tree rooted at \p Operands and returns a + /// VPInstruction combining \p Operands, if they can be combined. + VPInstruction *buildGraph(ArrayRef Operands); + + /// Return the width of the widest combined bundle in bits. + unsigned getWidestBundleBits() const { return WidestBundleBits; } + + /// Return true if all visited instruction can be combined. 
+ bool isCompletelySLP() const { return CompletelySLP; } +}; +} // end namespace llvm + +#endif // LLVM_TRANSFORMS_VECTORIZE_VPLANSLP_H diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index a1a2cf211abf8..7e9ef46133936 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -25,6 +25,7 @@ #include "llvm/ADT/SetVector.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Analysis/IVDescriptors.h" +#include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/PatternMatch.h" diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h index 23e39ce89a3a4..aabc4ab571e7a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanValue.h +++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h @@ -435,41 +435,6 @@ class VPDef { #endif }; -class VPlan; -class VPBasicBlock; - -/// This class can be used to assign names to VPValues. For VPValues without -/// underlying value, assign consecutive numbers and use those as names (wrapped -/// in vp<>). Otherwise, use the name from the underlying value (wrapped in -/// ir<>), appending a .V version number if there are multiple uses of the same -/// name. Allows querying names for VPValues for printing, similar to the -/// ModuleSlotTracker for IR values. -class VPSlotTracker { - /// Keep track of versioned names assigned to VPValues with underlying IR - /// values. - DenseMap VPValue2Name; - /// Keep track of the next number to use to version the base name. - StringMap BaseName2Version; - - /// Number to assign to the next VPValue without underlying value. 
- unsigned NextSlot = 0; - - void assignName(const VPValue *V); - void assignNames(const VPlan &Plan); - void assignNames(const VPBasicBlock *VPBB); - -public: - VPSlotTracker(const VPlan *Plan = nullptr) { - if (Plan) - assignNames(*Plan); - } - - /// Returns the name assigned to \p V, if there is one, otherwise try to - /// construct one from the underlying value, if there's one; else return - /// . - std::string getOrCreateName(const VPValue *V) const; -}; - } // namespace llvm #endif // LLVM_TRANSFORMS_VECTORIZE_VPLAN_VALUE_H diff --git a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp index e3c542ec5cac8..3a2658ea1e8dc 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanSlpTest.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "../lib/Transforms/Vectorize/VPlanSLP.h" #include "../lib/Transforms/Vectorize/VPlan.h" #include "../lib/Transforms/Vectorize/VPlanHCFGBuilder.h" #include "VPlanTestBase.h" diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index e7987a95f1ca2..23ecffa2db3b7 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -9,6 +9,7 @@ #include "../lib/Transforms/Vectorize/VPlan.h" #include "../lib/Transforms/Vectorize/VPlanCFG.h" +#include "../lib/Transforms/Vectorize/VPlanHelpers.h" #include "VPlanTestBase.h" #include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/PostOrderIterator.h"