diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 0d0b342505214..5296364c92f0e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -206,6 +206,12 @@ static cl::opt VectorizeNonPowerOf2( "slp-vectorize-non-power-of-2", cl::init(false), cl::Hidden, cl::desc("Try to vectorize with non-power-of-2 number of elements.")); +/// Enables vectorization of copyable elements. +static cl::opt VectorizeCopyableElements( + "slp-copyable-elements", cl::init(true), cl::Hidden, + cl::desc("Try to replace values with the idempotent instructions for " + "better vectorization.")); + // Limit the number of alias checks. The limit is chosen so that // it has no negative effect on the llvm benchmarks. static const unsigned AliasedCheckLimit = 10; @@ -855,6 +861,13 @@ static std::optional getExtractIndex(const Instruction *E) { return *EI->idx_begin(); } +namespace llvm { +/// Checks if the specified value does not require scheduling. It does not +/// require scheduling if all operands and all users do not need to be scheduled +/// in the current basic block. +static bool doesNotNeedToBeScheduled(Value *V); +} // namespace llvm + namespace { /// \returns true if \p Opcode is allowed as part of the main/alternate /// instruction for SLP vectorization. @@ -957,6 +970,33 @@ class BinOpSameOpcodeHelper { return Instruction::Xor; llvm_unreachable("Cannot find interchangeable instruction."); } + + /// Return true if the instruction can be converted to \p Opcode. 
+ bool hasCandidateOpcode(unsigned Opcode) const { + MaskType Candidate = Mask & SeenBefore; + switch (Opcode) { + case Instruction::Shl: + return Candidate & ShlBIT; + case Instruction::AShr: + return Candidate & AShrBIT; + case Instruction::Mul: + return Candidate & MulBIT; + case Instruction::Add: + return Candidate & AddBIT; + case Instruction::Sub: + return Candidate & SubBIT; + case Instruction::And: + return Candidate & AndBIT; + case Instruction::Or: + return Candidate & OrBIT; + case Instruction::Xor: + return Candidate & XorBIT; + default: + break; + } + llvm_unreachable("Cannot find interchangeable instruction."); + } + SmallVector getOperand(const Instruction *To) const { unsigned ToOpcode = To->getOpcode(); unsigned FromOpcode = I->getOpcode(); @@ -1117,6 +1157,10 @@ class BinOpSameOpcodeHelper { AltOp.trySet(OpcodeInMaskForm, InterchangeableMask)); } unsigned getMainOpcode() const { return MainOp.getOpcode(); } + /// Checks if the list of potential opcodes includes \p Opcode. + bool hasCandidateOpcode(unsigned Opcode) const { + return MainOp.hasCandidateOpcode(Opcode); + } bool hasAltOp() const { return AltOp.I; } unsigned getAltOpcode() const { return hasAltOp() ? AltOp.getOpcode() : getMainOpcode(); @@ -1152,6 +1196,8 @@ class InstructionsState { /// GetVectorCost. Instruction *MainOp = nullptr; Instruction *AltOp = nullptr; + /// Whether the instruction state represents copyable instructions. + bool HasCopyables = false; public: Instruction *getMainOp() const { @@ -1190,9 +1236,11 @@ class InstructionsState { if (!I->isBinaryOp()) return nullptr; BinOpSameOpcodeHelper Converter(MainOp); - if (Converter.add(I) && Converter.add(MainOp) && !Converter.hasAltOp()) - return MainOp; - return AltOp; + if (!Converter.add(I) || !Converter.add(MainOp)) + return nullptr; + if (Converter.hasAltOp() && !isAltShuffle()) + return nullptr; + return Converter.hasAltOp() ? AltOp : MainOp; } /// Checks if main/alt instructions are shift operations. 
@@ -1237,9 +1285,63 @@ class InstructionsState { explicit operator bool() const { return valid(); } InstructionsState() = delete; - InstructionsState(Instruction *MainOp, Instruction *AltOp) - : MainOp(MainOp), AltOp(AltOp) {} + InstructionsState(Instruction *MainOp, Instruction *AltOp, + bool HasCopyables = false) + : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {} static InstructionsState invalid() { return {nullptr, nullptr}; } + + bool isCopyableElement(Value *V) const { + assert(valid() && "InstructionsState is invalid."); + if (!HasCopyables) + return false; + if (isAltShuffle() || getOpcode() == Instruction::GetElementPtr) + return false; + auto *I = dyn_cast(V); + if (!I) + return !isa(V); + if (I->getParent() != MainOp->getParent() && + (!isVectorLikeInstWithConstOps(I) || + !isVectorLikeInstWithConstOps(MainOp))) + return true; + if (I->getOpcode() == MainOp->getOpcode()) + return false; + if (!I->isBinaryOp()) + return true; + BinOpSameOpcodeHelper Converter(MainOp); + return !Converter.add(I) || !Converter.add(MainOp) || + Converter.hasAltOp() || !Converter.hasCandidateOpcode(getOpcode()); + } + + /// Checks if the value is non-schedulable. + bool isNonSchedulable(Value *V) const { + assert(valid() && "InstructionsState is invalid."); + auto *I = dyn_cast(V); + if (!HasCopyables) + return !I || isa(I) || isVectorLikeInstWithConstOps(I) || + doesNotNeedToBeScheduled(V); + // MainOp for copyables is always schedulable to correctly identify + // non-schedulable copyables. + if (isCopyableElement(V)) { + auto IsNonSchedulableCopyableElement = [this](Value *V) { + auto *I = dyn_cast(V); + return !I || isa(I) || I->getParent() != MainOp->getParent() || + (doesNotNeedToBeScheduled(I) && + // If the copyable instruction comes after MainOp + // (non-schedulable, but used in the block) - cannot vectorize + // it, will possibly generate use before def. 
+ (isVectorLikeInstWithConstOps(I) || !MainOp->comesBefore(I))); + }; + + return IsNonSchedulableCopyableElement(V); + } + return !I || isa(I) || isVectorLikeInstWithConstOps(I) || + doesNotNeedToBeScheduled(V); + } + + bool areInstructionsWithCopyableElements() const { + assert(valid() && "InstructionsState is invalid."); + return HasCopyables; + } }; std::pair> @@ -2899,9 +3001,6 @@ class BoUpSLP { for (OperandDataVec &Ops : OpsVec) Ops.resize(NumLanes); for (unsigned Lane : seq(NumLanes)) { - Value *V = VL[Lane]; - assert((isa(V) || isa(V)) && - "Expected instruction or poison value"); // Our tree has just 3 nodes: the root and two operands. // It is therefore trivial to get the APO. We only need to check the // opcode of V and whether the operand at OpIdx is the LHS or RHS @@ -2912,17 +3011,24 @@ class BoUpSLP { // Since operand reordering is performed on groups of commutative // operations or alternating sequences (e.g., +, -), we can safely tell // the inverse operations by checking commutativity. - if (isa(V)) { + auto *I = dyn_cast(VL[Lane]); + if (!I && isa(VL[Lane])) { for (unsigned OpIdx : seq(NumOperands)) OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], true, false}; continue; } - auto [SelectedOp, Ops] = convertTo(cast(V), S); - // We cannot check commutativity by the converted instruction - // (SelectedOp) because isCommutative also examines def-use - // relationships. - bool IsInverseOperation = - !isCommutative(SelectedOp, cast(V)); + bool IsInverseOperation = false; + if (S.isCopyableElement(VL[Lane])) { + // The value is a copyable element. + IsInverseOperation = !isCommutative(MainOp); + } else { + assert(I && "Expected instruction"); + auto [SelectedOp, Ops] = convertTo(I, S); + // We cannot check commutativity by the converted instruction + // (SelectedOp) because isCommutative also examines def-use + // relationships. + IsInverseOperation = !isCommutative(SelectedOp, I); + } for (unsigned OpIdx : seq(ArgSize)) { bool APO = (OpIdx == 0) ? 
false : IsInverseOperation; OpsVec[OpIdx][Lane] = {Operands[OpIdx][Lane], APO, false}; @@ -3792,6 +3898,9 @@ class BoUpSLP { /// reordering of operands during buildTreeRec() and vectorizeTree(). SmallVector Operands; + /// Copyable elements of the entry node. + SmallPtrSet CopyableElements; + /// MainOp and AltOp are recorded inside. S should be obtained from /// newTreeEntry. InstructionsState S = InstructionsState::invalid(); @@ -3820,11 +3929,7 @@ class BoUpSLP { void setInterleave(unsigned Factor) { InterleaveFactor = Factor; } /// Marks the node as one that does not require scheduling. - void setDoesNotNeedToSchedule() { - assert(::doesNotNeedToSchedule(Scalars) && - "Expected to not need scheduling"); - DoesNotNeedToSchedule = true; - } + void setDoesNotNeedToSchedule() { DoesNotNeedToSchedule = true; } /// Returns true if the node is marked as one that does not require /// scheduling. bool doesNotNeedToSchedule() const { return DoesNotNeedToSchedule; } @@ -3896,6 +4001,20 @@ class BoUpSLP { bool hasState() const { return S.valid(); } + /// Add \p V to the list of copyable elements. + void addCopyableElement(Value *V) { + assert(S.isCopyableElement(V) && "Not a copyable element."); + CopyableElements.insert(V); + } + + /// Returns true if \p V is a copyable element. + bool isCopyableElement(Value *V) const { + return CopyableElements.contains(V); + } + + /// Returns true if any scalar in the list is a copyable element. + bool hasCopyableElements() const { return !CopyableElements.empty(); } + /// When ReuseReorderShuffleIndices is empty it just returns position of \p /// V within vector of Scalars. Otherwise, try to remap on its reuse index. 
unsigned findLaneForValue(Value *V) const { @@ -3968,6 +4087,8 @@ class BoUpSLP { for (Value *V : Scalars) dbgs().indent(2) << *V << "\n"; dbgs() << "State: "; + if (S && hasCopyableElements()) + dbgs() << "[[Copyable]] "; switch (State) { case Vectorize: if (InterleaveFactor > 0) { @@ -4145,12 +4266,20 @@ class BoUpSLP { } } } else if (!Last->isGather()) { - if (doesNotNeedToSchedule(VL)) + if (isa(S.getMainOp()) || + isVectorLikeInstWithConstOps(S.getMainOp()) || + (!S.areInstructionsWithCopyableElements() && + doesNotNeedToSchedule(VL)) || + all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); })) Last->setDoesNotNeedToSchedule(); SmallPtrSet Processed; for (Value *V : VL) { if (isa(V)) continue; + if (S.isCopyableElement(V)) { + Last->addCopyableElement(V); + continue; + } auto It = ScalarToTreeEntries.find(V); if (It == ScalarToTreeEntries.end()) { ScalarToTreeEntries.try_emplace(V).first->getSecond().push_back(Last); @@ -4162,16 +4291,14 @@ class BoUpSLP { } } // Update the scheduler bundle to point to this TreeEntry. - assert((!Bundle.getBundle().empty() || isa(S.getMainOp()) || - isVectorLikeInstWithConstOps(S.getMainOp()) || - Last->doesNotNeedToSchedule()) && + assert((!Bundle.getBundle().empty() || Last->doesNotNeedToSchedule()) && "Bundle and VL out of sync"); if (!Bundle.getBundle().empty()) { #if !defined(NDEBUG) || defined(EXPENSIVE_CHECKS) auto *BundleMember = Bundle.getBundle().begin(); SmallPtrSet Processed; for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V) || !Processed.insert(V).second) + if (S.isNonSchedulable(V) || !Processed.insert(V).second) continue; ++BundleMember; } @@ -4280,7 +4407,8 @@ class BoUpSLP { /// in general. 
ScalarsVectorizationLegality getScalarsVectorizationLegality(ArrayRef VL, unsigned Depth, - const EdgeInfo &UserTreeIdx) const; + const EdgeInfo &UserTreeIdx, + bool TryCopyableElementsVectorization) const; /// Checks if the specified list of the instructions/values can be vectorized /// and fills required data before actual scheduling of the instructions. @@ -4996,7 +5124,8 @@ class BoUpSLP { /// Build a bundle from the ScheduleData nodes corresponding to the /// scalar instruction for each lane. - ScheduleBundle &buildBundle(ArrayRef VL); + ScheduleBundle &buildBundle(ArrayRef VL, + const InstructionsState &S); /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are @@ -7882,7 +8011,7 @@ void BoUpSLP::buildExternalUses( // For each lane: for (int Lane = 0, LE = Entry->Scalars.size(); Lane != LE; ++Lane) { Value *Scalar = Entry->Scalars[Lane]; - if (!isa(Scalar)) + if (!isa(Scalar) || Entry->isCopyableElement(Scalar)) continue; // All uses must be replaced already? No need to do it again. auto It = ScalarToExtUses.find(Scalar); @@ -9612,7 +9741,8 @@ static bool tryToFindDuplicates(SmallVectorImpl &VL, PoisonValue::get(UniqueValues.front()->getType())); // Check that extended with poisons operations are still valid for // vectorization (div/rem are not allowed). - if (!getSameOpcode(PaddedUniqueValues, TLI).valid()) { + if (!S.areInstructionsWithCopyableElements() && + !getSameOpcode(PaddedUniqueValues, TLI).valid()) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); ReuseShuffleIndices.clear(); return false; @@ -9761,13 +9891,95 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef VL, } namespace { -/// Class accepts incoming list of values and generates the list of values -/// for scheduling and list of operands for the new nodes. 
+/// Class accepts incoming list of values, checks if it is able to model +/// "copyable" values as compatible operations, and generates the list of values +/// for scheduling and list of operands for the new nodes. class InstructionsCompatibilityAnalysis { DominatorTree &DT; const DataLayout &DL; const TargetTransformInfo &TTI; const TargetLibraryInfo &TLI; + unsigned MainOpcode = 0; + Instruction *MainOp = nullptr; + + /// Identifies the best candidate value, which represents main opcode + /// operation. + /// Currently the best candidate is the Add instruction with the parent + /// block with the highest DFS incoming number (block, that dominates other). + void findAndSetMainInstruction(ArrayRef VL) { + BasicBlock *Parent = nullptr; + // Checks if the instruction has supported opcode. + auto IsSupportedOpcode = [](Instruction *I) { + return I && I->getOpcode() == Instruction::Add; + }; + SmallDenseSet Operands; + for (Value *V : VL) { + auto *I = dyn_cast(V); + if (!I) + continue; + if (!DT.isReachableFromEntry(I->getParent())) + continue; + if (!MainOp) { + MainOp = I; + Parent = I->getParent(); + Operands.insert(I->op_begin(), I->op_end()); + continue; + } + if (Parent == I->getParent()) { + if (!IsSupportedOpcode(MainOp)) + MainOp = I; + if (MainOp->getOpcode() == I->getOpcode() && + doesNotNeedToBeScheduled(MainOp) && !doesNotNeedToBeScheduled(I)) + MainOp = I; + Operands.insert(I->op_begin(), I->op_end()); + continue; + } + auto *NodeA = DT.getNode(Parent); + auto *NodeB = DT.getNode(I->getParent()); + assert(NodeA && "Should only process reachable instructions"); + assert(NodeB && "Should only process reachable instructions"); + assert((NodeA == NodeB) == + (NodeA->getDFSNumIn() == NodeB->getDFSNumIn()) && + "Different nodes should have different DFS numbers"); + if (NodeA->getDFSNumIn() < NodeB->getDFSNumIn()) { + MainOp = I; + Parent = I->getParent(); + Operands.clear(); + Operands.insert(I->op_begin(), I->op_end()); + } + } + if
(!IsSupportedOpcode(MainOp) || Operands.contains(MainOp)) { + MainOp = nullptr; + return; + } + MainOpcode = MainOp->getOpcode(); + } + + /// Returns the idempotent value for the \p MainOp with the detected \p + /// MainOpcode. For Add, returns 0. For Or, it should choose between false and + /// the operand itself, since V or V == V. + Value *selectBestIdempotentValue() const { + assert(MainOpcode == Instruction::Add && "Unsupported opcode"); + return ConstantExpr::getBinOpIdentity(MainOpcode, MainOp->getType(), + !MainOp->isCommutative()); + } + + /// Returns the value and operands for the \p V, considering if it is original + /// instruction and its actual operands should be returned, or it is a + /// copyable element and it should be represented as idempotent instruction. + SmallVector getOperands(const InstructionsState &S, Value *V) const { + if (isa(V)) + return {V, V}; + if (!S.isCopyableElement(V)) + return convertTo(cast(V), S).second; + switch (MainOpcode) { + case Instruction::Add: + return {V, selectBestIdempotentValue()}; + default: + break; + } + llvm_unreachable("Unsupported opcode"); + } /// Builds operands for the original instructions. void @@ -9928,22 +10140,151 @@ class InstructionsCompatibilityAnalysis { const TargetLibraryInfo &TLI) : DT(DT), DL(DL), TTI(TTI), TLI(TLI) {} + InstructionsState + buildInstructionsState(ArrayRef VL, const BoUpSLP &R, + bool TryCopyableElementsVectorization, + bool WithProfitabilityCheck = false) { + InstructionsState S = getSameOpcode(VL, TLI); + if (S) + return S; + if (!VectorizeCopyableElements || !TryCopyableElementsVectorization) + return S; + findAndSetMainInstruction(VL); + if (!MainOp) + return InstructionsState::invalid(); + S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true); + // TODO: Remove this check once support for schedulable copyables is landed. 
+ if (any_of(VL, [&](Value *V) { + return S.isCopyableElement(V) && !S.isNonSchedulable(V); + })) + return InstructionsState::invalid(); + + if (!WithProfitabilityCheck) + return S; + // Check if it is profitable to vectorize the instruction. + SmallVector Operands = buildOperands(S, VL); + if (VL.size() == 2) { + // Check if the operands allow better vectorization. + SmallVector, 4> Candidates; + Candidates.emplace_back(Operands[0][0], Operands[0][1]); + Candidates.emplace_back(Operands[1][0], Operands[1][1]); + if (isCommutative(MainOp)) { + Candidates.emplace_back(Operands[0][0], Operands[1][1]); + Candidates.emplace_back(Operands[1][0], Operands[0][1]); + } + // No good candidates - not profitable. + if (!R.findBestRootPair(Candidates, + BoUpSLP::LookAheadHeuristics::ScoreSplat)) { + // Deeper analysis for 2 splats/constants. + SmallVector, 4> Candidates1, Candidates2; + Candidates1.emplace_back(Operands[0][0], Operands[0][1]); + Candidates2.emplace_back(Operands[1][0], Operands[1][1]); + bool Res = + R.findBestRootPair(Candidates1) && R.findBestRootPair(Candidates2); + if (!Res && isCommutative(MainOp)) { + Candidates1.clear(); + Candidates2.clear(); + Candidates1.emplace_back(Operands[0][0], Operands[1][1]); + Candidates2.emplace_back(Operands[1][0], Operands[0][1]); + Res = R.findBestRootPair(Candidates1) && + R.findBestRootPair(Candidates2); + } + if (!Res) + return InstructionsState::invalid(); + } + return S; + } + assert(Operands.size() == 2 && "Unexpected number of operands!"); + unsigned CopyableNum = + count_if(VL, [&](Value *V) { return S.isCopyableElement(V); }); + if (CopyableNum < VL.size() / 2) + return S; + // Check profitability if number of copyables >= VL.size() / 2. + // 1. Reorder operands for better matching. + if (isCommutative(MainOp)) { + for (auto &Ops : Operands) { + // Make instructions the first operands. 
+ if (!isa(Ops.front()) && isa(Ops.back())) { + std::swap(Ops.front(), Ops.back()); + continue; + } + // Make constants the second operands. + if (isa(Ops.front())) { + std::swap(Ops.front(), Ops.back()); + continue; + } + } + } + // 2. Check, if operands can be vectorized. + if (count_if(Operands.back(), IsaPred) > 1) + return InstructionsState::invalid(); + auto CheckOperand = [&](ArrayRef Ops) { + if (allConstant(Ops) || isSplat(Ops)) + return true; + // Check if it is "almost" splat, i.e. has >= 4 elements and only single + // one is different. + constexpr unsigned Limit = 4; + if (Operands.front().size() >= Limit) { + SmallDenseMap Counters; + for (Value *V : Ops) { + if (isa(V)) + continue; + ++Counters[V]; + } + if (Counters.size() == 2 && + any_of(Counters, [&](const std::pair &C) { + return C.second == 1; + })) + return true; + } + // First operand not a constant or splat? Last attempt - check for + // potential vectorization. + InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI); + InstructionsState OpS = Analysis.buildInstructionsState( + Ops, R, /*TryCopyableElementsVectorization=*/true); + if (!OpS) + return false; + unsigned CopyableNum = + count_if(Ops, [&](Value *V) { return OpS.isCopyableElement(V); }); + return CopyableNum <= VL.size() / 2; + }; + if (!CheckOperand(Operands.front())) + return InstructionsState::invalid(); + + return S; + } + SmallVector buildOperands(const InstructionsState &S, ArrayRef VL) { assert(S && "Invalid state!"); SmallVector Operands; - buildOriginalOperands(S, VL, Operands); + if (S.areInstructionsWithCopyableElements()) { + MainOp = S.getMainOp(); + MainOpcode = S.getOpcode(); + Operands.assign(MainOp->getNumOperands(), + BoUpSLP::ValueList(VL.size(), nullptr)); + for (auto [Idx, V] : enumerate(VL)) { + SmallVector OperandsForValue = getOperands(S, V); + for (auto [OperandIdx, Operand] : enumerate(OperandsForValue)) + Operands[OperandIdx][Idx] = Operand; + } + } else { + buildOriginalOperands(S, VL, Operands); 
+ } return Operands; } }; } // namespace -BoUpSLP::ScalarsVectorizationLegality -BoUpSLP::getScalarsVectorizationLegality(ArrayRef VL, unsigned Depth, - const EdgeInfo &UserTreeIdx) const { +BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality( + ArrayRef VL, unsigned Depth, const EdgeInfo &UserTreeIdx, + bool TryCopyableElementsVectorization) const { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); - InstructionsState S = getSameOpcode(VL, *TLI); + InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI); + InstructionsState S = Analysis.buildInstructionsState( + VL, *this, TryCopyableElementsVectorization, + /*WithProfitabilityCheck=*/true); // Don't go into catchswitch blocks, which can happen with PHIs. // Such blocks can only have PHIs and the catchswitch. There is no @@ -10242,9 +10583,9 @@ void BoUpSLP::buildTreeRec(ArrayRef VLRef, unsigned Depth, return true; }; - ScalarsVectorizationLegality Legality = - getScalarsVectorizationLegality(VL, Depth, UserTreeIdx); - const InstructionsState &S = Legality.getInstructionsState(); + ScalarsVectorizationLegality Legality = getScalarsVectorizationLegality( + VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/false); + InstructionsState S = Legality.getInstructionsState(); if (!Legality.isLegal()) { if (Legality.trySplitVectorize()) { auto [MainOp, AltOp] = getMainAltOpsNoStateVL(VL); @@ -10252,11 +10593,18 @@ void BoUpSLP::buildTreeRec(ArrayRef VLRef, unsigned Depth, if (MainOp && AltOp && TrySplitNode(InstructionsState(MainOp, AltOp))) return; } - if (Legality.tryToFindDuplicates()) - tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx); + if (!S) + Legality = getScalarsVectorizationLegality( + VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true); + if (!Legality.isLegal()) { + if (Legality.tryToFindDuplicates()) + tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, + UserTreeIdx); - newGatherTreeEntry(VL, S, 
UserTreeIdx, ReuseShuffleIndices); - return; + newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices); + return; + } + S = Legality.getInstructionsState(); } // FIXME: investigate if there are profitable cases for VL.size() <= 4. @@ -13024,7 +13372,8 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, assert(E->getOpcode() && ((allSameType(VL) && allSameBlock(VL)) || (E->getOpcode() == Instruction::GetElementPtr && - E->getMainOp()->getType()->isPointerTy())) && + E->getMainOp()->getType()->isPointerTy()) || + E->hasCopyableElements()) && "Invalid VL"); Instruction *VL0 = E->getMainOp(); unsigned ShuffleOrOp = @@ -13036,6 +13385,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, SmallBitVector UsedScalars(Sz, false); for (unsigned I = 0; I < Sz; ++I) { if (isa(UniqueValues[I]) && + !E->isCopyableElement(UniqueValues[I]) && getTreeEntries(UniqueValues[I]).front() == E) continue; UsedScalars.set(I); @@ -16075,6 +16425,8 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { auto *I = dyn_cast(V); if (!I) continue; + if (E->isCopyableElement(I)) + continue; if (FirstInst->getParent() == I->getParent()) { if (I->comesBefore(FirstInst)) FirstInst = I; @@ -16139,7 +16491,8 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { return nullptr; for (Value *V : E->Scalars) { auto *I = dyn_cast(V); - if (!I || isa(I) || doesNotNeedToBeScheduled(I)) + if (!I || isa(I) || + (!E->isCopyableElement(I) && doesNotNeedToBeScheduled(I))) continue; ArrayRef Bundles = It->second->getScheduleBundles(I); if (Bundles.empty()) @@ -16158,8 +16511,8 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { [](Value *V) { return !isa(V) && isa(V); })) || - all_of(E->Scalars, [](Value *V) { - return isa(V) || + all_of(E->Scalars, [&](Value *V) { + return isa(V) || E->isCopyableElement(V) || (!isVectorLikeInstWithConstOps(V) && isUsedOutsideBlock(V)); })) Res = FindLastInst(); @@ -18640,6 +18993,7 
@@ Value *BoUpSLP::vectorizeTree( TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize && (TE->UserTreeIndex.UserTE->getOpcode() != Instruction::PHI || TE->UserTreeIndex.UserTE->isAltShuffle()) && + !TE->UserTreeIndex.UserTE->hasCopyableElements() && all_of(TE->UserTreeIndex.UserTE->Scalars, [](Value *V) { return isUsedOutsideBlock(V); })) { Instruction &LastInst = @@ -19182,7 +19536,7 @@ Value *BoUpSLP::vectorizeTree( if (auto *EE = dyn_cast(Scalar); EE && IgnoredExtracts.contains(EE)) continue; - if (isa(Scalar)) + if (!isa(Scalar) || Entry->isCopyableElement(Scalar)) continue; #ifndef NDEBUG Type *Ty = Scalar->getType(); @@ -19424,12 +19778,15 @@ void BoUpSLP::optimizeGatherSequence() { } BoUpSLP::ScheduleBundle & -BoUpSLP::BlockScheduling::buildBundle(ArrayRef VL) { +BoUpSLP::BlockScheduling::buildBundle(ArrayRef VL, + const InstructionsState &S) { auto &BundlePtr = ScheduledBundlesList.emplace_back(std::make_unique()); for (Value *V : VL) { if (doesNotNeedToBeScheduled(V)) continue; + if (S.isCopyableElement(V)) + continue; ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member " "(maybe not in same basic block)"); @@ -19450,10 +19807,19 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, const InstructionsState &S) { // No need to schedule PHIs, insertelement, extractelement and extractvalue // instructions. + bool HasCopyables = S.areInstructionsWithCopyableElements(); if (isa(S.getMainOp()) || - isVectorLikeInstWithConstOps(S.getMainOp()) || doesNotNeedToSchedule(VL)) + isVectorLikeInstWithConstOps(S.getMainOp()) || + (!HasCopyables && doesNotNeedToSchedule(VL)) || + all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); })) return nullptr; + // TODO Remove once full support for copyables is landed. 
+ assert(all_of(VL, + [&](Value *V) { + return !S.isCopyableElement(V) || S.isNonSchedulable(V); + }) && + "Copyable elements should not be schedulable"); // Initialize the instruction bundle. Instruction *OldScheduleEnd = ScheduleEnd; LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n"); @@ -19499,7 +19865,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, // Make sure that the scheduling region contains all // instructions of the bundle. for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V)) + if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V)) continue; if (!extendSchedulingRegion(V, S)) { // If the scheduling region got new instructions at the lower end (or it @@ -19516,7 +19882,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, bool ReSchedule = false; for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V)) + if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V)) continue; ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && @@ -19541,7 +19907,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, ReSchedule = true; } - ScheduleBundle &Bundle = buildBundle(VL); + ScheduleBundle &Bundle = buildBundle(VL, S); TryScheduleBundleImpl(ReSchedule, Bundle); if (!Bundle.isReady()) { for (ScheduleData *BD : Bundle.getBundle()) { @@ -19558,7 +19924,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef VL, BoUpSLP *SLP, } ScheduledBundlesList.pop_back(); for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V)) + if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V)) continue; ScheduledBundles.find(cast(V))->getSecond().pop_back(); } @@ -20187,7 +20553,7 @@ bool BoUpSLP::collectValuesToDemote( }; if (E.isGather() || !Visited.insert(&E).second || any_of(E.Scalars, [&](Value *V) { - return !isa(V) && all_of(V->users(), [&](User *U) { + return !isa(V) && all_of(V->users(), [&](User *U) { return isa(U) && !isVectorized(U); }); })) @@ -20653,7 +21019,12 @@ 
void BoUpSLP::computeMinimumValueSizes() { if (!IsKnownPositive) ++BitWidth1; - APInt Mask = DB->getDemandedBits(cast(Root)); + auto *I = dyn_cast(Root); + if (!I) { + MaxBitWidth = std::max(BitWidth1, MaxBitWidth); + continue; + } + APInt Mask = DB->getDemandedBits(I); unsigned BitWidth2 = Mask.getBitWidth() - Mask.countl_zero(); MaxBitWidth = std::max(std::min(BitWidth1, BitWidth2), MaxBitWidth); @@ -20982,7 +21353,9 @@ SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, for (Value *V : Chain) ValOps.insert(cast(V)->getValueOperand()); // Operands are not same/alt opcodes or non-power-of-2 uniques - exit. - InstructionsState S = getSameOpcode(ValOps.getArrayRef(), *TLI); + InstructionsCompatibilityAnalysis Analysis(*DT, *DL, *TTI, *TLI); + InstructionsState S = Analysis.buildInstructionsState( + ValOps.getArrayRef(), R, /*TryCopyableElementsVectorization=*/true); if (all_of(ValOps, IsaPred) && ValOps.size() > 1) { DenseSet Stores(Chain.begin(), Chain.end()); bool IsAllowedSize = diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll index 07fdc9d8dd2fa..7408ba10cc772 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll @@ -4,9 +4,6 @@ define void @test() { ; CHECK-LABEL: define void @test() { ; CHECK-NEXT: [[BB:.*:]] -; CHECK-NEXT: [[ADD:%.*]] = add i32 1, 0 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[ADD]], i32 3 -; CHECK-NEXT: [[TMP1:%.*]] = icmp ult <4 x i32> [[TMP0]], zeroinitializer ; CHECK-NEXT: [[ICMP:%.*]] = icmp samesign ult i32 0, 0 ; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[ICMP]], i32 0, i32 0 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SELECT]] to i64 @@ -17,8 +14,7 @@ define void @test() { ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[CALL]], i32 3 ; CHECK-NEXT: [[TMP3:%.*]] = 
icmp eq <4 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i1> [[TMP1]], <4 x i1> poison, <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> [[TMP5]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> , <8 x i32> ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll index 15ba98f90f0b8..5e3d4715e99c5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll @@ -7,17 +7,10 @@ define i32 @test() { ; CHECK-NEXT: br label %[[FUNC_135_EXIT_I:.*]] ; CHECK: [[FUNC_135_EXIT_I]]: ; CHECK-NEXT: [[G_228_PROMOTED166_I1105_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[G_228_PROMOTED166_I1105_I]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <12 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> poison, i32 [[G_228_PROMOTED166_I1105_I]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i32> [[TMP7]], <16 x i32> [[TMP9]], <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <12 x i32> [[TMP3]], <12 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP17]], <16 x i32> [[TMP8]], <16 
x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 [[G_228_PROMOTED166_I1105_I]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i32> , [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = icmp ult <16 x i32> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP12]], <16 x i1> [[TMP13]], <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll b/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll index 1c482e079bb0f..03d76ef571d64 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/node-outside-used-only.ll @@ -4,11 +4,10 @@ define i64 @test() { ; CHECK-LABEL: define i64 @test() { ; CHECK-NEXT: [[BB:.*]]: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 0, i32 1 ; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ zeroinitializer, %[[BB]] ], [ [[TMP4:%.*]], %[[BB5:.*]] ] -; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP2:%.*]] = or <2 x i32> zeroinitializer, [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP2]], <2 x i32> ; CHECK-NEXT: [[TMP4]] = or <2 x i32> [[TMP3]], zeroinitializer ; CHECK-NEXT: br label %[[BB5]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll index 652abef14771d..6bb52e0fc43b3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-schedulable-instructions-become-schedulable.ll 
@@ -7,19 +7,17 @@ define void @test() { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: br label %[[BB1:.*]] ; CHECK: [[IF_THEN_I_I:.*]]: -; CHECK-NEXT: br label %[[BB5:.*]] +; CHECK-NEXT: br label %[[BB3:.*]] ; CHECK: [[BB1]]: ; CHECK-NEXT: [[TMP0:%.*]] = zext i1 false to i64 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> , i64 [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = add <2 x i64> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> , <4 x i64> [[TMP3]], <4 x i32> -; CHECK-NEXT: br i1 false, label %[[BB5]], label %[[BB2:.*]] -; CHECK: [[BB5]]: -; CHECK-NEXT: [[TMP6:%.*]] = phi <4 x i64> [ [[TMP4]], %[[BB1]] ], [ poison, %[[IF_THEN_I_I]] ] +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i64> , i64 [[TMP0]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i64> zeroinitializer, [[TMP1]] +; CHECK-NEXT: br i1 false, label %[[BB3]], label %[[BB2:.*]] +; CHECK: [[BB3]]: +; CHECK-NEXT: [[TMP4:%.*]] = phi <4 x i64> [ [[TMP2]], %[[BB1]] ], [ poison, %[[IF_THEN_I_I]] ] ; CHECK-NEXT: br label %[[BB2]] ; CHECK: [[BB2]]: -; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP6]], %[[BB5]] ], [ [[TMP4]], %[[BB1]] ] +; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x i64> [ [[TMP4]], %[[BB3]] ], [ [[TMP2]], %[[BB1]] ] ; CHECK-NEXT: store <4 x i64> [[TMP7]], ptr getelementptr inbounds nuw (i8, ptr null, i64 40), align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll index a4949bc67b0f1..782aada17acac 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll @@ -6,14 +6,9 @@ target triple = "x86_64-unknown-linux-gnu" define <4 x i32> @foo(<4 x i32> %x, i32 %f) { ; CHECK-LABEL: @foo( -; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> undef, i32 [[F:%.*]], i32 0 -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[F]], 1 -; 
CHECK-NEXT: [[VECINIT1:%.*]] = insertelement <4 x i32> [[VECINIT]], i32 [[ADD]], i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[VECINIT51:%.*]] = shufflevector <4 x i32> [[VECINIT1]], <4 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[VECINIT:%.*]] = insertelement <4 x i32> poison, i32 [[F:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[VECINIT]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[VECINIT51:%.*]] = add <4 x i32> [[TMP2]], ; CHECK-NEXT: ret <4 x i32> [[VECINIT51]] ; %vecinit = insertelement <4 x i32> undef, i32 %f, i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll index ad4daeab003f5..125c2dce32663 100644 --- a/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll +++ b/llvm/test/Transforms/SLPVectorizer/alternate-non-profitable.ll @@ -150,9 +150,9 @@ define <2 x i32> @replace_through_int_casts_ele0_only(i16 %inp, <2 x i16> %dead) define <2 x i8> @replace_through_binop_fail_cant_speculate(i8 %inp, <2 x i8> %d, <2 x i8> %any) { ; CHECK-LABEL: define <2 x i8> @replace_through_binop_fail_cant_speculate( ; CHECK-SAME: i8 [[INP:%.*]], <2 x i8> [[D:%.*]], <2 x i8> [[ANY:%.*]]) { -; CHECK-NEXT: [[ADD:%.*]] = add i8 [[INP]], 5 -; CHECK-NEXT: [[V0:%.*]] = insertelement <2 x i8> poison, i8 [[INP]], i64 0 -; CHECK-NEXT: [[V:%.*]] = insertelement <2 x i8> [[V0]], i8 [[ADD]], i64 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i8> poison, i8 [[INP]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i8> [[TMP3]], <2 x i8> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[V:%.*]] = add <2 x i8> [[TMP2]], ; 
CHECK-NEXT: [[DIV0:%.*]] = sdiv <2 x i8> splat (i8 -128), [[V]] ; CHECK-NEXT: [[TMP1:%.*]] = xor i8 [[INP]], 123 ; CHECK-NEXT: [[R:%.*]] = insertelement <2 x i8> [[DIV0]], i8 [[TMP1]], i64 0