diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b2d689e327c6c..3045eeb3eb48e 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -525,17 +525,17 @@ static bool isSplat(ArrayRef<Value *> VL) { /// instructions, we need to use the converted opcode along with the original /// uses. /// \param I The instruction to check for commutativity -/// \param InstWithUses The instruction whose uses are analyzed for special +/// \param ValWithUses The value whose uses are analyzed for special /// patterns -static bool isCommutative(Instruction *I, Instruction *InstWithUses) { +static bool isCommutative(Instruction *I, Value *ValWithUses) { if (auto *Cmp = dyn_cast<CmpInst>(I)) return Cmp->isCommutative(); if (auto *BO = dyn_cast<BinaryOperator>(I)) return BO->isCommutative() || (BO->getOpcode() == Instruction::Sub && - !InstWithUses->hasNUsesOrMore(UsesLimit) && + !ValWithUses->hasNUsesOrMore(UsesLimit) && all_of( - InstWithUses->uses(), + ValWithUses->uses(), [](const Use &U) { // Commutative, if icmp eq/ne sub, 0 CmpPredicate Pred; @@ -552,8 +552,8 @@ static bool isCommutative(Instruction *I, Instruction *InstWithUses) { Flag->isOne()); })) || (BO->getOpcode() == Instruction::FSub && - !InstWithUses->hasNUsesOrMore(UsesLimit) && - all_of(InstWithUses->uses(), [](const Use &U) { + !ValWithUses->hasNUsesOrMore(UsesLimit) && + all_of(ValWithUses->uses(), [](const Use &U) { return match(U.getUser(), m_Intrinsic<Intrinsic::fabs>(m_Specific(U.get()))); })); @@ -570,6 +570,19 @@ static bool isCommutative(Instruction *I, Instruction *InstWithUses) { /// \returns true if the instruction is commutative, false otherwise static bool isCommutative(Instruction *I) { return isCommutative(I, I); } +/// \returns number of operands of \p I, considering commutativity. Returns 2 +/// for commutative intrinsics. +/// \param I The instruction to check for commutativity +static unsigned getNumberOfPotentiallyCommutativeOps(Instruction *I) { + if (isa<IntrinsicInst>(I) && isCommutative(I)) { + // IntrinsicInst::isCommutative returns true if swapping the first "two" + // arguments to the intrinsic produces the same result. + constexpr unsigned IntrinsicNumOperands = 2; + return IntrinsicNumOperands; + } + return I->getNumOperands(); +} + template <typename T> static std::optional<unsigned> getInsertExtractIndex(const Value *Inst, unsigned Offset) { @@ -862,6 +875,16 @@ static std::optional<unsigned> getExtractIndex(const Instruction *E) { } namespace llvm { +/// Checks if the provided value does not require scheduling. It does not +/// require scheduling if this is not an instruction or it is an instruction +/// that does not read/write memory and all operands are either not instructions +/// or phi nodes or instructions from different blocks. +static bool areAllOperandsNonInsts(Value *V); +/// Checks if the provided value does not require scheduling. It does not +/// require scheduling if this is not an instruction or it is an instruction +/// that does not read/write memory and all users are phi nodes or instructions +/// from different blocks. +static bool isUsedOutsideBlock(Value *V); /// Checks if the specified value does not require scheduling. It does not /// require scheduling if all operands and all users do not need to be scheduled /// in the current basic block.
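For reference, a minimal standalone sketch (not part of the patch) of the use pattern the Instruction::Sub branch above treats as commutative. The exact match expression falls in the lines elided between the two hunks, so this is an assumption based on the "icmp eq/ne sub, 0" comment and the surviving "Flag->isOne()" fragment (the other accepted shape is an llvm.abs use). Since (a - b == 0) iff (b - a == 0), a sub whose every use is an equality compare against zero may have its operands swapped; the helper name below is hypothetical:

    #include "llvm/IR/Instructions.h"
    #include "llvm/IR/PatternMatch.h"
    using namespace llvm;
    using namespace llvm::PatternMatch;

    // Hypothetical mirror of the per-use check inside isCommutative():
    // returns true for a use of the form `icmp eq/ne (sub a, b), 0`.
    static bool isEqNeCmpWithZeroUse(const Use &U) {
      CmpPredicate Pred;
      return match(U.getUser(), m_ICmp(Pred, m_Specific(U.get()), m_Zero())) &&
             ICmpInst::isEquality(Pred);
    }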
@@ -1307,6 +1330,7 @@ class InstructionsState { : MainOp(MainOp), AltOp(AltOp), HasCopyables(HasCopyables) {} static InstructionsState invalid() { return {nullptr, nullptr}; } + /// Checks if the value is a copyable element. bool isCopyableElement(Value *V) const { assert(valid() && "InstructionsState is invalid."); if (!HasCopyables) @@ -1338,6 +1362,8 @@ class InstructionsState { doesNotNeedToBeScheduled(V); // MainOp for copyables always schedulable to correctly identify // non-schedulable copyables. + if (getMainOp() == V) + return false; if (isCopyableElement(V)) { auto IsNonSchedulableCopyableElement = [this](Value *V) { auto *I = dyn_cast(V); @@ -1355,6 +1381,7 @@ class InstructionsState { doesNotNeedToBeScheduled(V); } + /// Checks if the state represents copyable instructions. bool areInstructionsWithCopyableElements() const { assert(valid() && "InstructionsState is invalid."); return HasCopyables; @@ -1886,6 +1913,7 @@ class BoUpSLP { class TreeEntry; class ScheduleEntity; class ScheduleData; + class ScheduleCopyableData; class ScheduleBundle; class ShuffleCostEstimator; class ShuffleInstructionBuilder; @@ -2246,6 +2274,7 @@ class BoUpSLP { operator bool() const { return UserTE != nullptr; } }; + friend struct DenseMapInfo; /// A helper class used for scoring candidates for two consecutive lanes. class LookAheadHeuristics { @@ -2384,6 +2413,11 @@ class BoUpSLP { if (C1 && C2) return LookAheadHeuristics::ScoreConstants; + // Consider constants and buildvector compatible. + if ((C1 && isa(V2)) || + (C2 && isa(V1))) + return LookAheadHeuristics::ScoreConstants; + // Extracts from consecutive indexes of the same vector better score as // the extracts could be optimized away. Value *EV1; @@ -3010,10 +3044,9 @@ class BoUpSLP { assert(S.valid() && "InstructionsState is invalid."); // IntrinsicInst::isCommutative returns true if swapping the first "two" // arguments to the intrinsic produces the same result. - constexpr unsigned IntrinsicNumOperands = 2; Instruction *MainOp = S.getMainOp(); unsigned NumOperands = MainOp->getNumOperands(); - ArgSize = isa(MainOp) ? IntrinsicNumOperands : NumOperands; + ArgSize = ::getNumberOfPotentiallyCommutativeOps(MainOp); OpsVec.resize(ArgSize); unsigned NumLanes = VL.size(); for (OperandDataVec &Ops : OpsVec) @@ -3038,7 +3071,7 @@ class BoUpSLP { bool IsInverseOperation = false; if (S.isCopyableElement(VL[Lane])) { // The value is a copyable element. - IsInverseOperation = !isCommutative(MainOp); + IsInverseOperation = !isCommutative(MainOp, VL[Lane]); } else { assert(I && "Expected instruction"); auto [SelectedOp, Ops] = convertTo(I, S); @@ -4518,8 +4551,6 @@ class BoUpSLP { bool isAliased(const MemoryLocation &Loc1, Instruction *Inst1, Instruction *Inst2) { assert(Loc1.Ptr && isSimple(Inst1) && "Expected simple first instruction."); - if (!isSimple(Inst2)) - return true; // First check if the result is already in the cache. AliasCacheKey Key = std::make_pair(Inst1, Inst2); auto Res = AliasCache.try_emplace(Key); @@ -4528,7 +4559,6 @@ class BoUpSLP { bool Aliased = isModOrRefSet(BatchAA.getModRefInfo(Inst2, Loc1)); // Store the result in the cache. Res.first->getSecond() = Aliased; - AliasCache.try_emplace(std::make_pair(Inst2, Inst1), Aliased); return Aliased; } @@ -4587,16 +4617,18 @@ class BoUpSLP { /// List of hashes of vector of loads, which are known to be non vectorizable. DenseSet ListOfKnonwnNonVectorizableLoads; - /// Represents a scheduling entity, either ScheduleData or ScheduleBundle. 
- /// ScheduleData used to gather dependecies for a single instructions, while - /// ScheduleBundle represents a batch of instructions, going to be groupped - /// together. + /// Represents a scheduling entity, either ScheduleData, ScheduleCopyableData + /// or ScheduleBundle. ScheduleData is used to gather dependencies for a single + /// instruction, while ScheduleBundle represents a batch of instructions that + /// are going to be grouped together. ScheduleCopyableData models an extra user + /// for "copyable" instructions. class ScheduleEntity { friend class ScheduleBundle; friend class ScheduleData; + friend class ScheduleCopyableData; protected: - enum class Kind { ScheduleData, ScheduleBundle }; + enum class Kind { ScheduleData, ScheduleBundle, ScheduleCopyableData }; Kind getKind() const { return K; } ScheduleEntity(Kind K) : K(K) {} @@ -4615,17 +4647,79 @@ class BoUpSLP { void setSchedulingPriority(int Priority) { SchedulingPriority = Priority; } int getSchedulingPriority() const { return SchedulingPriority; } bool isReady() const { - if (auto *SD = dyn_cast<ScheduleData>(this)) + if (const auto *SD = dyn_cast<ScheduleData>(this)) return SD->isReady(); + if (const auto *CD = dyn_cast<ScheduleCopyableData>(this)) + return CD->isReady(); return cast<ScheduleBundle>(this)->isReady(); } + /// Returns true if the dependency information has been calculated. + /// Note that dependency validity can vary between instructions within + /// a single bundle. + bool hasValidDependencies() const { + if (const auto *SD = dyn_cast<ScheduleData>(this)) + return SD->hasValidDependencies(); + if (const auto *CD = dyn_cast<ScheduleCopyableData>(this)) + return CD->hasValidDependencies(); + return cast<ScheduleBundle>(this)->hasValidDependencies(); + } + /// Gets the number of unscheduled dependencies. + int getUnscheduledDeps() const { + if (const auto *SD = dyn_cast<ScheduleData>(this)) + return SD->getUnscheduledDeps(); + if (const auto *CD = dyn_cast<ScheduleCopyableData>(this)) + return CD->getUnscheduledDeps(); + return cast<ScheduleBundle>(this)->unscheduledDepsInBundle(); + } + /// Increments the number of unscheduled dependencies. + int incrementUnscheduledDeps(int Incr) { + if (auto *SD = dyn_cast<ScheduleData>(this)) + return SD->incrementUnscheduledDeps(Incr); + return cast<ScheduleCopyableData>(this)->incrementUnscheduledDeps(Incr); + } + /// Gets the number of dependencies. + int getDependencies() const { + if (const auto *SD = dyn_cast<ScheduleData>(this)) + return SD->getDependencies(); + return cast<ScheduleCopyableData>(this)->getDependencies(); + } + /// Gets the instruction. + Instruction *getInst() const { + if (const auto *SD = dyn_cast<ScheduleData>(this)) + return SD->getInst(); + return cast<ScheduleCopyableData>(this)->getInst(); + } + /// Gets/sets if the bundle is scheduled. bool isScheduled() const { return IsScheduled; } void setScheduled(bool Scheduled) { IsScheduled = Scheduled; } static bool classof(const ScheduleEntity *) { return true; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump(raw_ostream &OS) const { + if (const auto *SD = dyn_cast<ScheduleData>(this)) + return SD->dump(OS); + if (const auto *CD = dyn_cast<ScheduleCopyableData>(this)) + return CD->dump(OS); + return cast<ScheduleBundle>(this)->dump(OS); + } + + LLVM_DUMP_METHOD void dump() const { + dump(dbgs()); + dbgs() << '\n'; + } +#endif // if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) }; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + friend inline raw_ostream &operator<<(raw_ostream &OS, + const BoUpSLP::ScheduleEntity &SE) { + SE.dump(OS); + return OS; + } +#endif + /// Contains all scheduling relevant data for an instruction.
/// A ScheduleData either represents a single instruction or a member of an /// instruction bundle (= a group of instructions which is combined into a @@ -4688,10 +4782,18 @@ class BoUpSLP { /// Clears all dependency information. void clearDependencies() { - Dependencies = InvalidDeps; - resetUnscheduledDeps(); + clearDirectDependencies(); MemoryDependencies.clear(); ControlDependencies.clear(); + } + + /// Clears only the direct dependencies, keeping control and memory + /// dependencies intact. + /// Required for copyable elements to correctly handle control/memory deps + /// and avoid extra recalculation of such deps. + void clearDirectDependencies() { + Dependencies = InvalidDeps; + resetUnscheduledDeps(); IsScheduled = false; } @@ -4781,7 +4883,7 @@ class BoUpSLP { class ScheduleBundle final : public ScheduleEntity { /// The schedule data for the instructions in the bundle. - SmallVector<ScheduleData *> Bundle; + SmallVector<ScheduleEntity *> Bundle; /// True if this bundle is valid. bool IsValid = true; /// The TreeEntry that this instruction corresponds to. @@ -4797,7 +4899,7 @@ class BoUpSLP { /// Verify basic self consistency properties void verify() const { - for (const ScheduleData *SD : Bundle) { + for (const ScheduleEntity *SD : Bundle) { if (SD->hasValidDependencies()) { assert(SD->getUnscheduledDeps() <= SD->getDependencies() && "invariant"); @@ -4817,7 +4919,7 @@ class BoUpSLP { int unscheduledDepsInBundle() const { assert(*this && "bundle must not be empty"); int Sum = 0; - for (const ScheduleData *BundleMember : Bundle) { + for (const ScheduleEntity *BundleMember : Bundle) { if (BundleMember->getUnscheduledDeps() == ScheduleData::InvalidDeps) return ScheduleData::InvalidDeps; Sum += BundleMember->getUnscheduledDeps(); @@ -4829,7 +4931,7 @@ class BoUpSLP { /// Note that depenendency validity can vary between instructions within /// a single bundle. bool hasValidDependencies() const { - return all_of(Bundle, [](const ScheduleData *SD) { + return all_of(Bundle, [](const ScheduleEntity *SD) { return SD->hasValidDependencies(); }); } @@ -4843,10 +4945,10 @@ class BoUpSLP { /// Returns the bundle of scheduling data, associated with the current /// instruction. - ArrayRef<ScheduleData *> getBundle() { return Bundle; } - ArrayRef<ScheduleData *> getBundle() const { return Bundle; } + ArrayRef<ScheduleEntity *> getBundle() { return Bundle; } + ArrayRef<ScheduleEntity *> getBundle() const { return Bundle; } /// Adds an instruction to the bundle. - void add(ScheduleData *SD) { Bundle.push_back(SD); } + void add(ScheduleEntity *SD) { Bundle.push_back(SD); } /// Gets/sets the associated tree entry. void setTreeEntry(TreeEntry *TE) { this->TE = TE; } @@ -4863,8 +4965,11 @@ class BoUpSLP { return; } OS << '['; - interleaveComma(Bundle, OS, - [&](const ScheduleData *SD) { OS << *SD->getInst(); }); + interleaveComma(Bundle, OS, [&](const ScheduleEntity *SD) { + if (isa<ScheduleCopyableData>(SD)) + OS << "<copyable>"; + OS << *SD->getInst(); + }); OS << ']'; } @@ -4883,6 +4988,129 @@ class BoUpSLP { } #endif + /// Contains all scheduling relevant data for the copyable instruction. + /// It models the virtual instructions that are supposed to replace the + /// original instructions. E.g., if instruction %0 = load is part of the + /// bundle [%0, %1], where %1 = add, then the ScheduleCopyableData models the + /// virtual instruction %virt = add %0, 0. + class ScheduleCopyableData final : public ScheduleEntity { + /// The source schedule data for the instruction. + Instruction *Inst = nullptr; + /// The edge information for the instruction.
+ const EdgeInfo EI; + /// This ScheduleCopyableData is in the current scheduling region if this + /// matches the current SchedulingRegionID of BlockScheduling. + int SchedulingRegionID = 0; + /// The bundle this data is part of. + ScheduleBundle &Bundle; + + public: + ScheduleCopyableData(int BlockSchedulingRegionID, Instruction *I, + const EdgeInfo &EI, ScheduleBundle &Bundle) + : ScheduleEntity(Kind::ScheduleCopyableData), Inst(I), EI(EI), + SchedulingRegionID(BlockSchedulingRegionID), Bundle(Bundle) {} + static bool classof(const ScheduleEntity *Entity) { + return Entity->getKind() == Kind::ScheduleCopyableData; + } + + /// Verify basic self consistency properties + void verify() { + if (hasValidDependencies()) { + assert(UnscheduledDeps <= Dependencies && "invariant"); + } else { + assert(UnscheduledDeps == Dependencies && "invariant"); + } + + if (IsScheduled) { + assert(hasValidDependencies() && UnscheduledDeps == 0 && + "unexpected scheduled state"); + } + } + + /// Returns true if the dependency information has been calculated. + /// Note that dependency validity can vary between instructions within + /// a single bundle. + bool hasValidDependencies() const { + return Dependencies != ScheduleData::InvalidDeps; + } + + /// Returns true if it is ready for scheduling, i.e. it has no more + /// unscheduled depending instructions/bundles. + bool isReady() const { return UnscheduledDeps == 0 && !IsScheduled; } + + /// Modifies the number of unscheduled dependencies for this instruction, + /// and returns the number of remaining dependencies for the containing + /// bundle. + int incrementUnscheduledDeps(int Incr) { + assert(hasValidDependencies() && + "increment of unscheduled deps would be meaningless"); + UnscheduledDeps += Incr; + assert(UnscheduledDeps >= 0 && "invariant"); + return UnscheduledDeps; + } + + /// Sets the number of unscheduled dependencies to the number of + /// dependencies. + void resetUnscheduledDeps() { UnscheduledDeps = Dependencies; } + + /// Gets the number of unscheduled dependencies. + int getUnscheduledDeps() const { return UnscheduledDeps; } + /// Gets the number of dependencies. + int getDependencies() const { return Dependencies; } + /// Initializes the number of dependencies. + void initDependencies() { Dependencies = 0; } + /// Increments the number of dependencies. + void incDependencies() { Dependencies++; } + + /// Gets scheduling region ID. + int getSchedulingRegionID() const { return SchedulingRegionID; } + + /// Gets the instruction. + Instruction *getInst() const { return Inst; } + + /// Clears all dependency information. + void clearDependencies() { + Dependencies = ScheduleData::InvalidDeps; + UnscheduledDeps = ScheduleData::InvalidDeps; + IsScheduled = false; + } + + /// Gets the edge information. + const EdgeInfo &getEdgeInfo() const { return EI; } + + /// Gets the bundle. + ScheduleBundle &getBundle() { return Bundle; } + const ScheduleBundle &getBundle() const { return Bundle; } + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump(raw_ostream &OS) const { OS << "[Copyable]" << *getInst(); } + + LLVM_DUMP_METHOD void dump() const { + dump(dbgs()); + dbgs() << '\n'; + } +#endif // !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + + private: + /// The number of dependencies. Invalid until calculated. These nodes + /// always have only a single dependency. + int Dependencies = ScheduleData::InvalidDeps; + + /// The number of dependencies minus the number of dependencies of scheduled + /// instructions.
As soon as this is zero, the instruction/bundle gets ready + /// for scheduling. + /// Note that this is negative as long as Dependencies is not calculated. + int UnscheduledDeps = ScheduleData::InvalidDeps; + }; + +#ifndef NDEBUG + friend inline raw_ostream & + operator<<(raw_ostream &OS, const BoUpSLP::ScheduleCopyableData &SD) { + SD.dump(OS); + return OS; + } +#endif + friend struct GraphTraits<BoUpSLP *>; friend struct DOTGraphTraits<BoUpSLP *>; @@ -4909,6 +5137,10 @@ class BoUpSLP { void clear() { ScheduledBundles.clear(); ScheduledBundlesList.clear(); + ScheduleCopyableDataMap.clear(); + ScheduleCopyableDataMapByInst.clear(); + ScheduleCopyableDataMapByInstUser.clear(); + ScheduleCopyableDataMapByUsers.clear(); ReadyInsts.clear(); ScheduleStart = nullptr; ScheduleEnd = nullptr; @@ -4935,7 +5167,7 @@ class BoUpSLP { // Avoid lookup if can't possibly be in map. return nullptr; ScheduleData *SD = ScheduleDataMap.lookup(I); - if (SD && isInSchedulingRegion(SD)) + if (SD && isInSchedulingRegion(*SD)) return SD; return nullptr; } @@ -4944,6 +5176,196 @@ class BoUpSLP { return getScheduleData(dyn_cast<Instruction>(V)); } + /// Returns the ScheduleCopyableData for the given edge (user tree entry and + /// operand number) and value. + ScheduleCopyableData *getScheduleCopyableData(const EdgeInfo &EI, + const Value *V) const { + if (ScheduleCopyableDataMap.empty()) + return nullptr; + auto It = ScheduleCopyableDataMap.find(std::make_pair(EI, V)); + if (It == ScheduleCopyableDataMap.end()) + return nullptr; + ScheduleCopyableData *SD = It->getSecond().get(); + if (!isInSchedulingRegion(*SD)) + return nullptr; + return SD; + } + + /// Returns the ScheduleCopyableData for the given user \p User, operand + /// number and operand \p V. + SmallVector<ScheduleCopyableData *> + getScheduleCopyableData(const Value *User, unsigned OperandIdx, + const Value *V) { + if (ScheduleCopyableDataMapByInstUser.empty()) + return {}; + const auto It = ScheduleCopyableDataMapByInstUser.find( + std::make_pair(std::make_pair(User, OperandIdx), V)); + if (It == ScheduleCopyableDataMapByInstUser.end()) + return {}; + SmallVector<ScheduleCopyableData *> Res; + for (ScheduleCopyableData *SD : It->getSecond()) { + if (isInSchedulingRegion(*SD)) + Res.push_back(SD); + } + return Res; + } + + /// Returns true if all operands of the given instruction \p User are + /// replaced by copyable data. + /// \param User The user instruction. + /// \param Op The operand, which might be replaced by the copyable data. + /// \param SLP The SLP tree. + /// \param NumOps The number of operands used. If the instruction uses the + /// same operand several times, check for the first use, then the second, + /// etc. + bool areAllOperandsReplacedByCopyableData(Instruction *User, + Instruction *Op, BoUpSLP &SLP, + unsigned NumOps) const { + assert(NumOps > 0 && "No operands"); + if (ScheduleCopyableDataMap.empty()) + return false; + SmallDenseMap<TreeEntry *, unsigned> PotentiallyReorderedEntriesCount; + SmallDenseMap<TreeEntry *, unsigned> OrderedEntriesCount; + for (const Use &U : User->operands()) { + if (U.get() != Op) + continue; + ArrayRef<TreeEntry *> Entries = SLP.getTreeEntries(User); + if (Entries.empty()) + return false; + // Check all tree entries to see if they have operands replaced by + // copyable data. + for (TreeEntry *TE : Entries) { + // Check if the user is commutative. + // The commutatives are handled later, as their operands can be + // reordered. + // The same applies even to non-commutative cmps, because we can + // potentially invert their predicate and, thus, reorder the operands.
+ bool IsCommutativeUser = + ::isCommutative(TE->getMatchingMainOpOrAltOp(User), User); + EdgeInfo EI(TE, U.getOperandNo()); + if (!IsCommutativeUser && !isa<CmpInst>(User)) { + unsigned &OpCnt = + OrderedEntriesCount.try_emplace(TE, 0).first->getSecond(); + if (!getScheduleCopyableData(EI, Op) && OpCnt < NumOps) + return false; + // Found copyable operand - continue. + ++OpCnt; + continue; + } + ++PotentiallyReorderedEntriesCount.try_emplace(TE, 0) + .first->getSecond(); + } + } + // Check the commutative/cmp entries. + if (!PotentiallyReorderedEntriesCount.empty()) { + for (auto &P : PotentiallyReorderedEntriesCount) { + auto *It = find(P.first->Scalars, User); + assert(It != P.first->Scalars.end() && + "User is not in the tree entry"); + int Lane = std::distance(P.first->Scalars.begin(), It); + assert(Lane >= 0 && "Lane is not found"); + if (isa<CmpInst>(User) && !P.first->ReorderIndices.empty()) + Lane = P.first->ReorderIndices[Lane]; + assert(Lane < static_cast<int>(P.first->Scalars.size()) && + "Couldn't find extract lane"); + SmallVector<unsigned> OpIndices; + for (unsigned OpIdx : + seq<unsigned>(::getNumberOfPotentiallyCommutativeOps( + P.first->getMainOp()))) { + if (P.first->getOperand(OpIdx)[Lane] == Op && + getScheduleCopyableData(EdgeInfo(P.first, OpIdx), Op)) + --P.getSecond(); + } + } + return all_of(PotentiallyReorderedEntriesCount, + [&](const std::pair<TreeEntry *, unsigned> &P) { + return P.second == NumOps - 1; + }); + } + return true; + } + + SmallVector<ScheduleCopyableData *> + getScheduleCopyableData(const Instruction *I) const { + if (ScheduleCopyableDataMapByInst.empty()) + return {}; + const auto It = ScheduleCopyableDataMapByInst.find(I); + if (It == ScheduleCopyableDataMapByInst.end()) + return {}; + SmallVector<ScheduleCopyableData *> Res; + for (ScheduleCopyableData *SD : It->getSecond()) { + if (isInSchedulingRegion(*SD)) + Res.push_back(SD); + } + return Res; + } + + SmallVector<ScheduleCopyableData *> + getScheduleCopyableDataUsers(const Instruction *User) const { + if (ScheduleCopyableDataMapByUsers.empty()) + return {}; + const auto It = ScheduleCopyableDataMapByUsers.find(User); + if (It == ScheduleCopyableDataMapByUsers.end()) + return {}; + SmallVector<ScheduleCopyableData *> Res; + for (ScheduleCopyableData *SD : It->getSecond()) { + if (isInSchedulingRegion(*SD)) + Res.push_back(SD); + } + return Res; + } + + ScheduleCopyableData &addScheduleCopyableData(const EdgeInfo &EI, + Instruction *I, + int SchedulingRegionID, + ScheduleBundle &Bundle) { + assert(!getScheduleCopyableData(EI, I) && "already in the map"); + ScheduleCopyableData *CD = + ScheduleCopyableDataMap + .try_emplace(std::make_pair(EI, I), + std::make_unique<ScheduleCopyableData>( + SchedulingRegionID, I, EI, Bundle)) + .first->getSecond() + .get(); + ScheduleCopyableDataMapByInst[I].push_back(CD); + if (EI.UserTE) { + ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx); + const auto *It = find(Op, I); + assert(It != Op.end() && "Lane not set"); + do { + int Lane = std::distance(Op.begin(), It); + assert(Lane >= 0 && "Lane not set"); + if (isa<CmpInst>(EI.UserTE->Scalars[Lane]) && + !EI.UserTE->ReorderIndices.empty()) + Lane = EI.UserTE->ReorderIndices[Lane]; + assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) && + "Couldn't find extract lane"); + auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]); + ScheduleCopyableDataMapByInstUser + .try_emplace(std::make_pair(std::make_pair(In, EI.EdgeIdx), I)) + .first->getSecond() + .push_back(CD); + ScheduleCopyableDataMapByUsers.try_emplace(I) + .first->getSecond() + .insert(CD); + // Remove extra deps for users that become non-immediate users of the + // instruction. This may happen if a chain of the same copyable elements + // appears in the tree.
+ if (In == I) { + EdgeInfo UserEI = EI.UserTE->UserTreeIndex; + if (ScheduleCopyableData *UserCD = + getScheduleCopyableData(UserEI, In)) + ScheduleCopyableDataMapByUsers[I].remove(UserCD); + } + It = find(make_range(std::next(It), Op.end()), I); + } while (It != Op.end()); + } else { + ScheduleCopyableDataMapByUsers.try_emplace(I).first->getSecond().insert( + CD); + } + return *CD; + } + ArrayRef getScheduleBundles(Value *V) const { auto *I = dyn_cast(V); if (!I) @@ -4954,34 +5376,44 @@ class BoUpSLP { return It->getSecond(); } - bool isInSchedulingRegion(ScheduleData *SD) const { - return SD->getSchedulingRegionID() == SchedulingRegionID; - } - - bool isInSchedulingRegion(const ScheduleBundle &Bundle) const { - return all_of(Bundle.getBundle(), [&](const ScheduleData *BundleMember) { - return BundleMember->getSchedulingRegionID() == SchedulingRegionID; - }); + /// Returns true if the entity is in the scheduling region. + bool isInSchedulingRegion(const ScheduleEntity &SD) const { + if (const auto *Data = dyn_cast(&SD)) + return Data->getSchedulingRegionID() == SchedulingRegionID; + if (const auto *CD = dyn_cast(&SD)) + return CD->getSchedulingRegionID() == SchedulingRegionID; + return all_of(cast(SD).getBundle(), + [&](const ScheduleEntity *BundleMember) { + return isInSchedulingRegion(*BundleMember); + }); } /// Marks an instruction as scheduled and puts all dependent ready /// instructions into the ready-list. template - void schedule(ScheduleEntity *Data, ReadyListType &ReadyList) { - auto ProcessBundleMember = [&](ScheduleData *BundleMember, - ScheduleBundle *Bundle) { + void schedule(const BoUpSLP &R, const InstructionsState &S, + const EdgeInfo &EI, ScheduleEntity *Data, + ReadyListType &ReadyList) { + auto ProcessBundleMember = [&](ScheduleEntity *BundleMember, + ArrayRef Bundles) { // Handle the def-use chain dependencies. // Decrement the unscheduled counter and insert to ready list if ready. - auto DecrUnsched = [&](ScheduleData *Data, bool IsControl = false) { + auto DecrUnsched = [&](auto *Data, bool IsControl = false) { if ((IsControl || Data->hasValidDependencies()) && Data->incrementUnscheduledDeps(-1) == 0) { // There are no more unscheduled dependencies after // decrementing, so we can put the dependent instruction // into the ready list. 
- if (ArrayRef Bundles = - getScheduleBundles(Data->getInst()); - !Bundles.empty()) { + SmallVector CopyableBundle; + ArrayRef Bundles; + if (auto *CD = dyn_cast(Data)) { + CopyableBundle.push_back(&CD->getBundle()); + Bundles = CopyableBundle; + } else { + Bundles = getScheduleBundles(Data->getInst()); + } + if (!Bundles.empty()) { for (ScheduleBundle *Bundle : Bundles) { if (Bundle->unscheduledDepsInBundle() == 0) { assert(!Bundle->isScheduled() && @@ -4995,12 +5427,23 @@ class BoUpSLP { } assert(!Data->isScheduled() && "already scheduled bundle gets ready"); + assert(!isa(Data) && + "Expected non-copyable data"); ReadyList.insert(Data); LLVM_DEBUG(dbgs() << "SLP: gets ready: " << *Data << "\n"); } }; - auto DecrUnschedForInst = [&](Instruction *I) { + auto DecrUnschedForInst = [&](Instruction *User, unsigned OpIdx, + Instruction *I) { + if (!ScheduleCopyableDataMap.empty()) { + SmallVector CopyableData = + getScheduleCopyableData(User, OpIdx, I); + for (ScheduleCopyableData *CD : CopyableData) + DecrUnsched(CD, /*IsControl=*/false); + if (!CopyableData.empty()) + return; + } if (ScheduleData *OpSD = getScheduleData(I)) DecrUnsched(OpSD, /*IsControl=*/false); }; @@ -5008,45 +5451,101 @@ class BoUpSLP { // If BundleMember is a vector bundle, its operands may have been // reordered during buildTree(). We therefore need to get its operands // through the TreeEntry. - if (Bundle) { - // Need to search for the lane since the tree entry can be reordered. + if (!Bundles.empty()) { auto *In = BundleMember->getInst(); - int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(), - find(Bundle->getTreeEntry()->Scalars, In)); - assert(Lane >= 0 && "Lane not set"); - - // Since vectorization tree is being built recursively this assertion - // ensures that the tree entry has all operands set before reaching - // this code. Couple of exceptions known at the moment are extracts - // where their second (immediate) operand is not added. Since - // immediates do not affect scheduler behavior this is considered - // okay. - assert(In && - (isa(In) || - In->getNumOperands() == - Bundle->getTreeEntry()->getNumOperands()) && - "Missed TreeEntry operands?"); - - for (unsigned OpIdx : - seq(Bundle->getTreeEntry()->getNumOperands())) - if (auto *I = dyn_cast( - Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) { - LLVM_DEBUG(dbgs() - << "SLP: check for readiness (def): " << *I << "\n"); - DecrUnschedForInst(I); + // Count uses of each instruction operand. + SmallDenseMap OperandsUses; + unsigned TotalOpCount = 0; + if (isa(BundleMember)) { + // Copyable data is used only once (uses itself). + TotalOpCount = OperandsUses[In] = 1; + } else { + for (const Use &U : In->operands()) { + if (auto *I = dyn_cast(U.get())) { + auto Res = OperandsUses.try_emplace(I, 0); + ++Res.first->getSecond(); + ++TotalOpCount; + } + } + } + // Decrement the unscheduled counter and insert to ready list if + // ready. 
+ auto DecrUnschedForInst = [&](Instruction *I, TreeEntry *UserTE, + unsigned OpIdx) { + if (!ScheduleCopyableDataMap.empty()) { + const EdgeInfo EI = {UserTE, OpIdx}; + if (ScheduleCopyableData *CD = getScheduleCopyableData(EI, I)) { + DecrUnsched(CD, /*IsControl=*/false); + return; + } + } + auto It = OperandsUses.find(I); + assert(It != OperandsUses.end() && "Operand not found"); + if (It->second > 0) { + --It->getSecond(); + assert(TotalOpCount > 0 && "No more operands to decrement"); + --TotalOpCount; + if (ScheduleData *OpSD = getScheduleData(I)) + DecrUnsched(OpSD, /*IsControl=*/false); } + }; + + for (ScheduleBundle *Bundle : Bundles) { + if (ScheduleCopyableDataMap.empty() && TotalOpCount == 0) + break; + // Need to search for the lane since the tree entry can be + // reordered. + int Lane = std::distance(Bundle->getTreeEntry()->Scalars.begin(), + find(Bundle->getTreeEntry()->Scalars, In)); + assert(Lane >= 0 && "Lane not set"); + if (isa(In) && + !Bundle->getTreeEntry()->ReorderIndices.empty()) + Lane = Bundle->getTreeEntry()->ReorderIndices[Lane]; + assert(Lane < static_cast( + Bundle->getTreeEntry()->Scalars.size()) && + "Couldn't find extract lane"); + + // Since vectorization tree is being built recursively this + // assertion ensures that the tree entry has all operands set before + // reaching this code. Couple of exceptions known at the moment are + // extracts where their second (immediate) operand is not added. + // Since immediates do not affect scheduler behavior this is + // considered okay. + assert(In && + (isa(In) || + In->getNumOperands() == + Bundle->getTreeEntry()->getNumOperands() || + Bundle->getTreeEntry()->isCopyableElement(In)) && + "Missed TreeEntry operands?"); + + for (unsigned OpIdx : + seq(Bundle->getTreeEntry()->getNumOperands())) + if (auto *I = dyn_cast( + Bundle->getTreeEntry()->getOperand(OpIdx)[Lane])) { + LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I + << "\n"); + DecrUnschedForInst(I, Bundle->getTreeEntry(), OpIdx); + } + } } else { // If BundleMember is a stand-alone instruction, no operand reordering // has taken place, so we directly access its operands. - for (Use &U : BundleMember->getInst()->operands()) + for (Use &U : BundleMember->getInst()->operands()) { if (auto *I = dyn_cast(U.get())) { LLVM_DEBUG(dbgs() << "SLP: check for readiness (def): " << *I << "\n"); - DecrUnschedForInst(I); + DecrUnschedForInst(BundleMember->getInst(), U.getOperandNo(), I); } + } } // Handle the memory dependencies. - for (ScheduleData *MemoryDep : BundleMember->getMemoryDependencies()) { + auto *SD = dyn_cast(BundleMember); + if (!SD) + return; + SmallPtrSet VisitedMemory; + for (ScheduleData *MemoryDep : SD->getMemoryDependencies()) { + if (!VisitedMemory.insert(MemoryDep).second) + continue; // There are no more unscheduled dependencies after decrementing, // so we can put the dependent instruction into the ready list. LLVM_DEBUG(dbgs() << "SLP: check for readiness (mem): " @@ -5054,7 +5553,10 @@ class BoUpSLP { DecrUnsched(MemoryDep); } // Handle the control dependencies. - for (ScheduleData *Dep : BundleMember->getControlDependencies()) { + SmallPtrSet VisitedControl; + for (ScheduleData *Dep : SD->getControlDependencies()) { + if (!VisitedControl.insert(Dep).second) + continue; // There are no more unscheduled dependencies after decrementing, // so we can put the dependent instruction into the ready list. 
LLVM_DEBUG(dbgs() @@ -5065,23 +5567,29 @@ class BoUpSLP { if (auto *SD = dyn_cast<ScheduleData>(Data)) { SD->setScheduled(/*Scheduled=*/true); LLVM_DEBUG(dbgs() << "SLP: schedule " << *SD << "\n"); - ProcessBundleMember(SD, nullptr); + ProcessBundleMember(SD, {}); } else { ScheduleBundle &Bundle = *cast<ScheduleBundle>(Data); Bundle.setScheduled(/*Scheduled=*/true); LLVM_DEBUG(dbgs() << "SLP: schedule " << Bundle << "\n"); - auto AreAllBundlesScheduled = [&](const ScheduleData *SD) { - ArrayRef<ScheduleBundle *> SDBundles = - getScheduleBundles(SD->getInst()); - return !SDBundles.empty() && - all_of(SDBundles, [&](const ScheduleBundle *SDBundle) { - return SDBundle->isScheduled(); - }); - }; - for (ScheduleData *SD : Bundle.getBundle()) { - if (AreAllBundlesScheduled(SD)) { + auto AreAllBundlesScheduled = + [&](const ScheduleEntity *SD, + ArrayRef<ScheduleBundle *> SDBundles) { + if (isa<ScheduleCopyableData>(SD)) + return true; + return !SDBundles.empty() && + all_of(SDBundles, [&](const ScheduleBundle *SDBundle) { + return SDBundle->isScheduled(); + }); + }; + for (ScheduleEntity *SD : Bundle.getBundle()) { + ArrayRef<ScheduleBundle *> SDBundles; + if (!isa<ScheduleCopyableData>(SD)) + SDBundles = getScheduleBundles(SD->getInst()); + if (AreAllBundlesScheduled(SD, SDBundles)) { SD->setScheduled(/*Scheduled=*/true); - ProcessBundleMember(SD, &Bundle); + ProcessBundleMember(SD, isa<ScheduleCopyableData>(SD) ? &Bundle + : SDBundles); } } } @@ -5109,7 +5617,7 @@ class BoUpSLP { auto *SD = getScheduleData(I); if (!SD) continue; - assert(isInSchedulingRegion(SD) && + assert(isInSchedulingRegion(*SD) && "primary schedule data not in window?"); SD->verify(); } @@ -5150,8 +5658,11 @@ class BoUpSLP { /// Build a bundle from the ScheduleData nodes corresponding to the /// scalar instruction for each lane. + /// \param VL The list of scalar instructions. + /// \param S The state of the instructions. + /// \param EI The edge in the SLP graph or the user node/operand number. ScheduleBundle &buildBundle(ArrayRef<Value *> VL, - const InstructionsState &S); + const InstructionsState &S, const EdgeInfo &EI); /// Checks if a bundle of instructions can be scheduled, i.e. has no /// cyclic dependencies. This is only a dry-run, no instructions are /// std::nullopt if \p VL is allowed to be scheduled. std::optional<ScheduleBundle *> tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, - const InstructionsState &S); + const InstructionsState &S, const EdgeInfo &EI); /// Allocates schedule data chunk. ScheduleData *allocateScheduleDataChunks(); @@ -5200,6 +5711,48 @@ class BoUpSLP { /// ScheduleData structures are recycled. SmallDenseMap<Instruction *, ScheduleData *> ScheduleDataMap; + /// Attaches ScheduleCopyableData to EdgeInfo (UserTreeEntry + operand + /// number) and the operand instruction, represented as copyable element. + SmallDenseMap<std::pair<EdgeInfo, const Value *>, + std::unique_ptr<ScheduleCopyableData>> + ScheduleCopyableDataMap; + + /// Represents mapping between instruction and all related + /// ScheduleCopyableData (for all uses in the tree, represented as copyable + /// element). The SLP tree may contain several representations of the same + /// instruction. + SmallDenseMap<const Instruction *, SmallVector<ScheduleCopyableData *>> + ScheduleCopyableDataMapByInst; + + /// Represents mapping between user value and operand number, the operand + /// value and all related ScheduleCopyableData. The relation is 1:n, because + /// the same user may reference the same operand in different tree entries + /// and the operand may be modeled by a different copyable data element. + SmallDenseMap<std::pair<std::pair<const Value *, unsigned>, const Value *>, + SmallVector<ScheduleCopyableData *>> + ScheduleCopyableDataMapByInstUser; + + /// Represents mapping between instruction and all related + /// ScheduleCopyableData.
It represents the mapping between the actual + /// instruction and the last copyable data element in the chain. E.g., if + /// the graph models the following instructions: + /// %0 = non-add instruction ... + /// ... + /// %4 = add %3, 1 + /// %5 = add %4, 1 + /// %6 = insertelement poison, %0, 0 + /// %7 = insertelement %6, %5, 1 + /// And the graph is modeled as: + /// [%5, %0] -> [%4, copyable %0 <0> ] -> [%3, copyable %0 <1> ] + /// -> [1, 0] -> [%1, 0] + /// + /// this map will map %0 only to the copyable element <1>, which is the last + /// user (direct user of the actual instruction). <0> uses <1>, so <1> will + /// keep the map to <0>, not %0. + SmallDenseMap<const Instruction *, SmallSetVector<ScheduleCopyableData *, 4>> + ScheduleCopyableDataMapByUsers; + /// Attaches ScheduleBundle to Instruction. SmallDenseMap<Instruction *, SmallVector<ScheduleBundle *>> ScheduledBundles; @@ -5246,7 +5799,7 @@ class BoUpSLP { /// Performs the "real" scheduling. Done before vectorization is actually /// performed in a basic block. - void scheduleBlock(BlockScheduling *BS); + void scheduleBlock(const BoUpSLP &R, BlockScheduling *BS); /// List of users to ignore during scheduling and that don't need extracting. const SmallDenseSet<Value *> *UserIgnoreList = nullptr; @@ -5319,6 +5872,30 @@ class BoUpSLP { } // end namespace slpvectorizer +template <> struct DenseMapInfo<BoUpSLP::EdgeInfo> { + using FirstInfo = DenseMapInfo<BoUpSLP::TreeEntry *>; + using SecondInfo = DenseMapInfo<unsigned>; + static BoUpSLP::EdgeInfo getEmptyKey() { + return BoUpSLP::EdgeInfo(FirstInfo::getEmptyKey(), + SecondInfo::getEmptyKey()); + } + + static BoUpSLP::EdgeInfo getTombstoneKey() { + return BoUpSLP::EdgeInfo(FirstInfo::getTombstoneKey(), + SecondInfo::getTombstoneKey()); + } + + static unsigned getHashValue(const BoUpSLP::EdgeInfo &Val) { + return detail::combineHashValue(FirstInfo::getHashValue(Val.UserTE), + SecondInfo::getHashValue(Val.EdgeIdx)); + } + + static bool isEqual(const BoUpSLP::EdgeInfo &LHS, + const BoUpSLP::EdgeInfo &RHS) { + return LHS == RHS; + } +}; + template <> struct GraphTraits<BoUpSLP *> { using TreeEntry = BoUpSLP::TreeEntry; @@ -7195,12 +7772,45 @@ bool BoUpSLP::isProfitableToReorder() const { // Check if the tree has only single store and single (unordered) load node, // other nodes are phis or geps/binops, combined with phis, and/or single // gather load node - bool HasPhis = false; if (VectorizableTree.front()->hasState() && VectorizableTree.front()->getOpcode() == Instruction::PHI && VectorizableTree.front()->Scalars.size() == TinyVF && VectorizableTree.front()->getNumOperands() > PhiOpsLimit) return false; + // Single node that requires reordering - skip.
+ if (VectorizableTree.front()->hasState() && + VectorizableTree.front()->getOpcode() == Instruction::Store && + VectorizableTree.front()->ReorderIndices.empty()) { + const unsigned ReorderedSplitsCnt = + count_if(VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { + return TE->State == TreeEntry::SplitVectorize && + !TE->ReorderIndices.empty() && TE->UserTreeIndex.UserTE && + TE->UserTreeIndex.UserTE->State == TreeEntry::Vectorize && + ::isCommutative(TE->UserTreeIndex.UserTE->getMainOp()); + }); + if (ReorderedSplitsCnt <= 1 && + static_cast<unsigned>(count_if( + VectorizableTree, [&](const std::unique_ptr<TreeEntry> &TE) { + return ((!TE->isGather() && + (TE->ReorderIndices.empty() || + (TE->UserTreeIndex.UserTE && + TE->UserTreeIndex.UserTE->State == + TreeEntry::Vectorize && + !TE->UserTreeIndex.UserTE->ReuseShuffleIndices + .empty()))) || + (TE->isGather() && TE->ReorderIndices.empty() && + (!TE->hasState() || TE->isAltShuffle() || + TE->getOpcode() == Instruction::Load || + TE->getOpcode() == Instruction::ZExt || + TE->getOpcode() == Instruction::SExt))) && + (VectorizableTree.front()->getVectorFactor() > TinyVF || + !TE->isGather() || none_of(TE->Scalars, [&](Value *V) { + return !isConstant(V) && isVectorized(V); + })); + })) >= VectorizableTree.size() - ReorderedSplitsCnt) + return false; + } + bool HasPhis = false; bool HasLoad = true; unsigned GatherLoads = 0; for (const std::unique_ptr<TreeEntry> &TE : @@ -9772,7 +10382,8 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL, }))) { if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 && S.getMainOp()->isSafeToRemove() && - all_of(UniqueValues, IsaPred<Instruction, PoisonValue>)) { + (S.areInstructionsWithCopyableElements() || + all_of(UniqueValues, IsaPred<Instruction, PoisonValue>))) { // Find the number of elements, which forms full vectors. unsigned PWSz = getFullVectorNumberOfElements( TTI, UniqueValues.front()->getType(), UniqueValues.size()); @@ -9789,8 +10400,8 @@ static bool tryToFindDuplicates(SmallVectorImpl<Value *> &VL, PaddedUniqueValues.append( PWSz - UniqueValues.size(), PoisonValue::get(UniqueValues.front()->getType())); - // Check that extended with poisons operations are still valid for - // vectorization (div/rem are not allowed). + // Check that operations extended with poisons/copyables are still valid + // for vectorization (div/rem are not allowed). if (!S.areInstructionsWithCopyableElements() && !getSameOpcode(PaddedUniqueValues, TLI).valid()) { LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); @@ -9956,12 +10567,15 @@ class InstructionsCompatibilityAnalysis { /// operation. /// Currently the best candidate is the Add instruction with the parent /// block with the highest DFS incoming number (block, that dominates other). - void findAndSetMainInstruction(ArrayRef<Value *> VL) { + void findAndSetMainInstruction(ArrayRef<Value *> VL, const BoUpSLP &R) { BasicBlock *Parent = nullptr; // Checks if the instruction has supported opcode. - auto IsSupportedOpcode = [](Instruction *I) { - return I && I->getOpcode() == Instruction::Add; + auto IsSupportedOpcode = [&](Instruction *I) { + return I && I->getOpcode() == Instruction::Add && + (!doesNotNeedToBeScheduled(I) || !R.isVectorized(I)); }; + // Exclude operand instructions immediately to improve compile time; they + // will not be schedulable anyway.
SmallDenseSet<Value *> Operands; for (Value *V : VL) { auto *I = dyn_cast<Instruction>(V); if (!I) continue; if (!Parent) { MainOp = I; Parent = I->getParent(); continue; } if (Parent == I->getParent()) { - if (!IsSupportedOpcode(MainOp)) - MainOp = I; - if (MainOp->getOpcode() == I->getOpcode() && - doesNotNeedToBeScheduled(MainOp) && !doesNotNeedToBeScheduled(I)) + if (!IsSupportedOpcode(MainOp) && !Operands.contains(I)) MainOp = I; Operands.insert(I->op_begin(), I->op_end()); continue; } @@ -10202,16 +10813,10 @@ class InstructionsCompatibilityAnalysis { return S; if (!VectorizeCopyableElements || !TryCopyableElementsVectorization) return S; - findAndSetMainInstruction(VL); + findAndSetMainInstruction(VL, R); if (!MainOp) return InstructionsState::invalid(); S = InstructionsState(MainOp, MainOp, /*HasCopyables=*/true); - // TODO: Remove this check once support for schulable copyables is landed. - if (any_of(VL, [&](Value *V) { - return S.isCopyableElement(V) && !S.isNonSchedulable(V); - })) - return InstructionsState::invalid(); - if (!WithProfitabilityCheck) return S; // Check if it is profitable to vectorize the instruction. @@ -10731,7 +11336,7 @@ void BoUpSLP::buildTreeRec(ArrayRef<Value *> VLRef, unsigned Depth, SetVector<Value *> UniqueValues(llvm::from_range, VL); std::optional<ScheduleBundle *> BundlePtr = - BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S); + BS.tryScheduleBundle(UniqueValues.getArrayRef(), this, S, UserTreeIdx); #ifdef EXPENSIVE_CHECKS // Make sure we didn't break any internal invariants BS.verify(); #endif @@ -14695,6 +15300,31 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { }))))) return true; + // If the tree contains only a buildvector, 2 non-buildvector nodes (whose + // user is the root tree node) and other buildvectors, we can skip it. + if (!ForReduction && SLPCostThreshold.getNumOccurrences() && + VectorizableTree.front()->State == TreeEntry::SplitVectorize && + VectorizableTree.size() >= Limit && + count_if(ArrayRef(VectorizableTree).drop_front(), + [&](const std::unique_ptr<TreeEntry> &TE) { + return !TE->isGather() && TE->UserTreeIndex.UserTE && + TE->UserTreeIndex.UserTE->Idx == 0; + }) == 2) + return true; + + // If the tree contains only the vectorization of a phi node feeding a + // buildvector - skip it. + if (!ForReduction && SLPCostThreshold.getNumOccurrences() && + VectorizableTree.size() > 2 && + VectorizableTree.front()->State == TreeEntry::Vectorize && + VectorizableTree.front()->getOpcode() == Instruction::InsertElement && + VectorizableTree[1]->State == TreeEntry::Vectorize && + VectorizableTree[1]->getOpcode() == Instruction::PHI && + all_of( + ArrayRef(VectorizableTree).drop_front(2), + [&](const std::unique_ptr<TreeEntry> &TE) { return TE->isGather(); })) + return true; + // We can vectorize the tree if its size is greater than or equal to the // minimum size specified by the MinTreeSize command line option. if (VectorizableTree.size() >= MinTreeSize) @@ -19242,7 +19872,7 @@ Value *BoUpSLP::vectorizeTree( EntryToLastInstruction.clear(); // All blocks must be scheduled before any instructions are inserted. for (auto &BSIter : BlocksSchedules) - scheduleBlock(BSIter.second.get()); + scheduleBlock(*this, BSIter.second.get()); // Cache last instructions for the nodes to avoid side effects, which may // appear during vectorization, like extra uses, etc.
for (const std::unique_ptr<TreeEntry> &TE : VectorizableTree) { @@ -20049,24 +20679,29 @@ void BoUpSLP::optimizeGatherSequence() { GatherShuffleExtractSeq.clear(); } -BoUpSLP::ScheduleBundle & -BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL, - const InstructionsState &S) { +BoUpSLP::ScheduleBundle &BoUpSLP::BlockScheduling::buildBundle( + ArrayRef<Value *> VL, const InstructionsState &S, const EdgeInfo &EI) { auto &BundlePtr = ScheduledBundlesList.emplace_back(std::make_unique<ScheduleBundle>()); for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V)) + if (S.isNonSchedulable(V)) continue; - if (S.isCopyableElement(V)) + auto *I = cast<Instruction>(V); + if (S.isCopyableElement(V)) { + // Add a copyable element model. + ScheduleCopyableData &SD = + addScheduleCopyableData(EI, I, SchedulingRegionID, *BundlePtr); + // Group the instructions to a bundle. + BundlePtr->add(&SD); continue; + } ScheduleData *BundleMember = getScheduleData(V); assert(BundleMember && "no ScheduleData for bundle member " "(maybe not in same basic block)"); // Group the instructions to a bundle. BundlePtr->add(BundleMember); - ScheduledBundles.try_emplace(cast<Instruction>(V)) - .first->getSecond() - .push_back(BundlePtr.get()); + ScheduledBundles.try_emplace(I).first->getSecond().push_back( + BundlePtr.get()); } assert(BundlePtr && *BundlePtr && "Failed to find schedule bundle"); return *BundlePtr; } @@ -20076,7 +20711,8 @@ BoUpSLP::BlockScheduling::buildBundle(ArrayRef<Value *> VL, // and schedules instructions until the bundle gets ready. std::optional<BoUpSLP::ScheduleBundle *> BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, - const InstructionsState &S) { + const InstructionsState &S, + const EdgeInfo &EI) { // No need to schedule PHIs, insertelement, extractelement and extractvalue // instructions. bool HasCopyables = S.areInstructionsWithCopyableElements(); @@ -20086,30 +20722,65 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, all_of(VL, [&](Value *V) { return S.isNonSchedulable(V); })) return nullptr; - // TODO Remove once full support for copyables is landed. - assert(all_of(VL, - [&](Value *V) { - return !S.isCopyableElement(V) || S.isNonSchedulable(V); - }) && - "Copyable elements should not be schedulable"); // Initialize the instruction bundle. Instruction *OldScheduleEnd = ScheduleEnd; LLVM_DEBUG(dbgs() << "SLP: bundle: " << *S.getMainOp() << "\n"); auto TryScheduleBundleImpl = [=](bool ReSchedule, ScheduleBundle &Bundle) { + // Clear the deps, or recalculate the region, if the memory instruction + // is a copyable. It may have memory deps, which must be recalculated. + auto CheckIfNeedToClearDeps = [&](ScheduleBundle &Bundle) { + SmallDenseMap<std::pair<Instruction *, Value *>, unsigned> UserOpToNumOps; + for (ScheduleEntity *SE : Bundle.getBundle()) { + if (ScheduleCopyableData *SD = dyn_cast<ScheduleCopyableData>(SE)) { + if (ScheduleData *BundleMember = getScheduleData(SD->getInst()); + BundleMember && BundleMember->hasValidDependencies()) + BundleMember->clearDirectDependencies(); + continue; + } + auto *SD = cast<ScheduleData>(SE); + for (const Use &U : SD->getInst()->operands()) { + unsigned &NumOps = + UserOpToNumOps + .try_emplace(std::make_pair(SD->getInst(), U.get()), 0) + .first->getSecond(); + ++NumOps; + if (auto *Op = dyn_cast<Instruction>(U.get()); + Op && areAllOperandsReplacedByCopyableData(SD->getInst(), Op, + *SLP, NumOps)) { + if (ScheduleData *OpSD = getScheduleData(Op)) + OpSD->clearDirectDependencies(); + } + } + } + }; // The scheduling region got new instructions at the lower end (or it is a // new region for the first bundle). This makes it necessary to // recalculate all dependencies.
// It is seldom that this needs to be done a second time after adding the // initial bundle to the region. if (OldScheduleEnd && ScheduleEnd != OldScheduleEnd) { - for (auto *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - if (ScheduleData *SD = getScheduleData(I)) + for_each(ScheduleDataMap, [&](auto &P) { + if (BB != P.first->getParent()) + return; + ScheduleData *SD = P.second; + if (isInSchedulingRegion(*SD)) SD->clearDependencies(); - } + }); + for_each(ScheduleCopyableDataMapByInst, [&](auto &P) { + for_each(P.second, [&](ScheduleCopyableData *SD) { + if (isInSchedulingRegion(*SD)) + SD->clearDependencies(); + }); + }); ReSchedule = true; } + // Check if the bundle data already has deps for copyable elements. In + // that case, the deps need to be reset and recalculated. if (Bundle && !Bundle.getBundle().empty()) { + if (S.areInstructionsWithCopyableElements() || + !ScheduleCopyableDataMap.empty()) + CheckIfNeedToClearDeps(Bundle); LLVM_DEBUG(dbgs() << "SLP: try schedule bundle " << Bundle << " in block " << BB->getName() << "\n"); calculateDependencies(Bundle, /*InsertInReadyList=*/!ReSchedule, SLP); @@ -20128,7 +20799,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, !ReadyInsts.empty()) { ScheduleEntity *Picked = ReadyInsts.pop_back_val(); assert(Picked->isReady() && "must be ready to schedule"); - schedule(Picked, ReadyInsts); + schedule(*SLP, S, EI, Picked, ReadyInsts); if (Picked == &Bundle) break; } @@ -20137,7 +20808,7 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, // Make sure that the scheduling region contains all // instructions of the bundle. for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V)) + if (S.isNonSchedulable(V)) continue; if (!extendSchedulingRegion(V, S)) { // If the scheduling region got new instructions at the lower end (or it @@ -20154,11 +20825,19 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, bool ReSchedule = false; for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V)) + if (S.isNonSchedulable(V)) continue; + SmallVector<ScheduleCopyableData *> CopyableData = + getScheduleCopyableData(cast<Instruction>(V)); + if (!CopyableData.empty()) { + for (ScheduleCopyableData *SD : CopyableData) + ReadyInsts.remove(SD); + } ScheduleData *BundleMember = getScheduleData(V); - assert(BundleMember && + assert((BundleMember || S.isCopyableElement(V)) && "no ScheduleData for bundle member (maybe not in same basic block)"); + if (!BundleMember) + continue; // Make sure we don't leave the pieces of the bundle in the ready list when // whole bundle might not be ready. @@ -20169,20 +20848,25 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, ReadyInsts.remove(B); } - if (!BundleMember->isScheduled()) + if (!S.isCopyableElement(V) && !BundleMember->isScheduled()) continue; // A bundle member was scheduled as single instruction before and now // needs to be scheduled as part of the bundle. We just get rid of the // existing schedule. + // A bundle member may have had its deps calculated before it became a + // copyable element - need to reschedule.
LLVM_DEBUG(dbgs() << "SLP: reset schedule because " << *BundleMember << " was already scheduled\n"); ReSchedule = true; } - ScheduleBundle &Bundle = buildBundle(VL, S); + ScheduleBundle &Bundle = buildBundle(VL, S, EI); TryScheduleBundleImpl(ReSchedule, Bundle); if (!Bundle.isReady()) { - for (ScheduleData *BD : Bundle.getBundle()) { + for (ScheduleEntity *BD : Bundle.getBundle()) { + // For copyable data, the scheduling data is simply removed below. + if (isa<ScheduleCopyableData>(BD)) + continue; if (BD->isReady()) { ArrayRef<ScheduleBundle *> Bundles = getScheduleBundles(BD->getInst()); if (Bundles.empty()) { @@ -20196,9 +20880,49 @@ BoUpSLP::BlockScheduling::tryScheduleBundle(ArrayRef<Value *> VL, BoUpSLP *SLP, } ScheduledBundlesList.pop_back(); for (Value *V : VL) { - if (doesNotNeedToBeScheduled(V) || S.isCopyableElement(V)) + if (S.isNonSchedulable(V)) + continue; + auto *I = cast<Instruction>(V); + if (S.isCopyableElement(I)) { + // Remove the copyable data from the scheduling region and restore + // previous mappings. + auto KV = std::make_pair(EI, I); + assert(ScheduleCopyableDataMap.contains(KV) && + "no ScheduleCopyableData for copyable element"); + ScheduleCopyableData *SD = + ScheduleCopyableDataMapByInst.find(I)->getSecond().pop_back_val(); + ScheduleCopyableDataMapByUsers[I].remove(SD); + if (EI.UserTE) { + ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx); + const auto *It = find(Op, I); + assert(It != Op.end() && "Lane not set"); + do { + int Lane = std::distance(Op.begin(), It); + assert(Lane >= 0 && "Lane not set"); + if (isa<CmpInst>(EI.UserTE->Scalars[Lane]) && + !EI.UserTE->ReorderIndices.empty()) + Lane = EI.UserTE->ReorderIndices[Lane]; + assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) && + "Couldn't find extract lane"); + auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]); + ScheduleCopyableDataMapByInstUser + [std::make_pair(std::make_pair(In, EI.EdgeIdx), I)] + .pop_back(); + It = find(make_range(std::next(It), Op.end()), I); + } while (It != Op.end()); + EdgeInfo UserEI = EI.UserTE->UserTreeIndex; + if (ScheduleCopyableData *UserCD = getScheduleCopyableData(UserEI, I)) + ScheduleCopyableDataMapByUsers[I].insert(UserCD); + } + if (ScheduleCopyableDataMapByUsers[I].empty()) + ScheduleCopyableDataMapByUsers.erase(I); + ScheduleCopyableDataMap.erase(KV); + // Need to recalculate dependencies for the actual schedule data. + if (ScheduleData *OpSD = getScheduleData(I)) + OpSD->clearDirectDependencies(); continue; - ScheduledBundles.find(cast<Instruction>(V))->getSecond().pop_back(); + } + ScheduledBundles.find(I)->getSecond().pop_back(); } return std::nullopt; } @@ -20218,10 +20942,6 @@ bool BoUpSLP::BlockScheduling::extendSchedulingRegion( Value *V, const InstructionsState &S) { Instruction *I = dyn_cast<Instruction>(V); assert(I && "bundle member must be an instruction"); - assert(!isa<PHINode>(I) && !isVectorLikeInstWithConstOps(I) && - !doesNotNeedToBeScheduled(I) && - "phi nodes/insertelements/extractelements/extractvalues don't need to " - "be scheduled"); if (getScheduleData(I)) return true; if (!ScheduleStart) { @@ -20291,14 +21011,14 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, ScheduleData *CurrentLoadStore = PrevLoadStore; for (Instruction *I = FromI; I != ToI; I = I->getNextNode()) { // No need to allocate data for non-schedulable instructions.
- if (doesNotNeedToBeScheduled(I)) + if (isa<PHINode>(I)) continue; ScheduleData *SD = ScheduleDataMap.lookup(I); if (!SD) { SD = allocateScheduleDataChunks(); ScheduleDataMap[I] = SD; } - assert(!isInSchedulingRegion(SD) && + assert(!isInSchedulingRegion(*SD) && "new ScheduleData already in scheduling region"); SD->init(SchedulingRegionID, I); @@ -20331,31 +21051,122 @@ void BoUpSLP::BlockScheduling::initScheduleData(Instruction *FromI, void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, bool InsertInReadyList, BoUpSLP *SLP) { - SmallVector<ScheduleData *> WorkList; - auto ProcessNode = [&](ScheduleData *BundleMember) { + SmallVector<ScheduleEntity *> WorkList; + auto ProcessNode = [&](ScheduleEntity *SE) { + if (auto *CD = dyn_cast<ScheduleCopyableData>(SE)) { + if (CD->hasValidDependencies()) + return; + LLVM_DEBUG(dbgs() << "SLP: update deps of " << *CD << "\n"); + CD->initDependencies(); + CD->resetUnscheduledDeps(); + const EdgeInfo &EI = CD->getEdgeInfo(); + if (EI.UserTE) { + ArrayRef<Value *> Op = EI.UserTE->getOperand(EI.EdgeIdx); + const auto *It = find(Op, CD->getInst()); + assert(It != Op.end() && "Lane not set"); + do { + int Lane = std::distance(Op.begin(), It); + assert(Lane >= 0 && "Lane not set"); + if (isa<CmpInst>(EI.UserTE->Scalars[Lane]) && + !EI.UserTE->ReorderIndices.empty()) + Lane = EI.UserTE->ReorderIndices[Lane]; + assert(Lane < static_cast<int>(EI.UserTE->Scalars.size()) && + "Couldn't find extract lane"); + auto *In = cast<Instruction>(EI.UserTE->Scalars[Lane]); + if (EI.UserTE->isCopyableElement(In)) { + // We may not have related copyable scheduling data, if the + // instruction is non-schedulable. + if (ScheduleCopyableData *UseSD = + getScheduleCopyableData(EI.UserTE->UserTreeIndex, In)) { + CD->incDependencies(); + if (!UseSD->isScheduled()) + CD->incrementUnscheduledDeps(1); + if (!UseSD->hasValidDependencies() || + (InsertInReadyList && UseSD->isReady())) + WorkList.push_back(UseSD); + } + } else if (ScheduleData *UseSD = getScheduleData(In)) { + CD->incDependencies(); + if (!UseSD->isScheduled()) + CD->incrementUnscheduledDeps(1); + if (!UseSD->hasValidDependencies() || + (InsertInReadyList && UseSD->isReady())) + WorkList.push_back(UseSD); + } + It = find(make_range(std::next(It), Op.end()), CD->getInst()); + } while (It != Op.end()); + if (CD->isReady() && CD->getDependencies() == 0 && + (EI.UserTE->hasState() && + (EI.UserTE->getMainOp()->getParent() != + CD->getInst()->getParent() || + (isa<PHINode>(EI.UserTE->getMainOp()) && + (EI.UserTE->getMainOp()->hasNUsesOrMore(UsesLimit) || + any_of(EI.UserTE->getMainOp()->users(), [&](User *U) { + auto *IU = dyn_cast<Instruction>(U); + if (!IU) + return true; + return IU->getParent() == EI.UserTE->getMainOp()->getParent(); + })))))) { + // If there are no uses in the block - mark it as having a + // pseudo-use, which cannot be scheduled. + // This prevents incorrect def-use tracking between an external user + // and the actual instruction. + CD->incDependencies(); + CD->incrementUnscheduledDeps(1); + } + } + return; + } + auto *BundleMember = cast<ScheduleData>(SE); if (BundleMember->hasValidDependencies()) return; LLVM_DEBUG(dbgs() << "SLP: update deps of " << *BundleMember << "\n"); BundleMember->initDependencies(); BundleMember->resetUnscheduledDeps(); // Handle def-use chain dependencies. + SmallDenseMap<User *, unsigned> UserToNumOps; for (User *U : BundleMember->getInst()->users()) { + if (isa<PHINode>(U)) + continue; if (ScheduleData *UseSD = getScheduleData(U)) { + // The operand is a copyable element - skip.
+ unsigned &NumOps = UserToNumOps.try_emplace(U, 0).first->getSecond(); + ++NumOps; + if (areAllOperandsReplacedByCopyableData( + cast(U), BundleMember->getInst(), *SLP, NumOps)) + continue; BundleMember->incDependencies(); if (!UseSD->isScheduled()) BundleMember->incrementUnscheduledDeps(1); - WorkList.push_back(UseSD); + if (!UseSD->hasValidDependencies() || + (InsertInReadyList && UseSD->isReady())) + WorkList.push_back(UseSD); } } + for (ScheduleCopyableData *UseSD : + getScheduleCopyableDataUsers(BundleMember->getInst())) { + BundleMember->incDependencies(); + if (!UseSD->isScheduled()) + BundleMember->incrementUnscheduledDeps(1); + if (!UseSD->hasValidDependencies() || + (InsertInReadyList && UseSD->isReady())) + WorkList.push_back(UseSD); + } + SmallPtrSet Visited; auto MakeControlDependent = [&](Instruction *I) { + // Do not mark control dependent twice. + if (!Visited.insert(I).second) + return; auto *DepDest = getScheduleData(I); assert(DepDest && "must be in schedule window"); DepDest->addControlDependency(BundleMember); BundleMember->incDependencies(); if (!DepDest->isScheduled()) BundleMember->incrementUnscheduledDeps(1); - WorkList.push_back(DepDest); + if (!DepDest->hasValidDependencies() || + (InsertInReadyList && DepDest->isReady())) + WorkList.push_back(DepDest); }; // Any instruction which isn't safe to speculate at the beginning of the @@ -20434,7 +21245,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, for (ScheduleData *DepDest = NextLoadStore; DepDest; DepDest = DepDest->getNextLoadStore()) { - assert(isInSchedulingRegion(DepDest) && "Expected to be in region"); + assert(isInSchedulingRegion(*DepDest) && "Expected to be in region"); // We have two limits to reduce the complexity: // 1) AliasedCheckLimit: It's a small limit to reduce calls to @@ -20457,7 +21268,9 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, BundleMember->incDependencies(); if (!DepDest->isScheduled()) BundleMember->incrementUnscheduledDeps(1); - WorkList.push_back(DepDest); + if (!DepDest->hasValidDependencies() || + (InsertInReadyList && DepDest->isReady())) + WorkList.push_back(DepDest); } // Example, explaining the loop break condition: Let's assume our @@ -20482,10 +21295,18 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, WorkList.push_back(Bundle.getBundle().front()); SmallPtrSet Visited; while (!WorkList.empty()) { - ScheduleData *SD = WorkList.pop_back_val(); - ArrayRef Bundles = getScheduleBundles(SD->getInst()); + ScheduleEntity *SD = WorkList.pop_back_val(); + SmallVector CopyableBundle; + ArrayRef Bundles; + if (auto *CD = dyn_cast(SD)) { + CopyableBundle.push_back(&CD->getBundle()); + Bundles = CopyableBundle; + } else { + Bundles = getScheduleBundles(SD->getInst()); + } if (Bundles.empty()) { - ProcessNode(SD); + if (!SD->hasValidDependencies()) + ProcessNode(SD); if (InsertInReadyList && SD->isReady()) { ReadyInsts.insert(SD); LLVM_DEBUG(dbgs() << "SLP: gets ready on update: " << *SD << "\n"); @@ -20493,7 +21314,7 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, continue; } for (ScheduleBundle *Bundle : Bundles) { - if (!Visited.insert(Bundle).second || Bundle->hasValidDependencies()) + if (Bundle->hasValidDependencies() || !Visited.insert(Bundle).second) continue; assert(isInSchedulingRegion(*Bundle) && "ScheduleData not in scheduling region"); @@ -20516,23 +21337,40 @@ void BoUpSLP::BlockScheduling::calculateDependencies(ScheduleBundle &Bundle, void 
BoUpSLP::BlockScheduling::resetSchedule() { assert(ScheduleStart && "tried to reset schedule on block which has not been scheduled"); - for (Instruction *I = ScheduleStart; I != ScheduleEnd; I = I->getNextNode()) { - if (ScheduleData *SD = getScheduleData(I)) { - assert(isInSchedulingRegion(SD) && - "ScheduleData not in scheduling region"); + for_each(ScheduleDataMap, [&](auto &P) { + if (BB != P.first->getParent()) + return; + ScheduleData *SD = P.second; + if (isInSchedulingRegion(*SD)) { SD->setScheduled(/*Scheduled=*/false); SD->resetUnscheduledDeps(); } - for (ScheduleBundle *Bundle : getScheduleBundles(I)) { - assert(isInSchedulingRegion(*Bundle) && - "ScheduleBundle not in scheduling region"); - Bundle->setScheduled(/*Scheduled=*/false); + }); + for_each(ScheduleCopyableDataMapByInst, [&](auto &P) { + for_each(P.second, [&](ScheduleCopyableData *SD) { + if (isInSchedulingRegion(*SD)) { + SD->setScheduled(/*Scheduled=*/false); + SD->resetUnscheduledDeps(); + } + }); + }); + for_each(ScheduledBundles, [&](auto &P) { + for_each(P.second, [&](ScheduleBundle *Bundle) { + if (isInSchedulingRegion(*Bundle)) + Bundle->setScheduled(/*Scheduled=*/false); + }); + }); + // Reset schedule data for copyable elements. + for (auto &P : ScheduleCopyableDataMap) { + if (isInSchedulingRegion(*P.second.get())) { + P.second->setScheduled(/*Scheduled=*/false); + P.second->resetUnscheduledDeps(); } } ReadyInsts.clear(); } -void BoUpSLP::scheduleBlock(BlockScheduling *BS) { +void BoUpSLP::scheduleBlock(const BoUpSLP &R, BlockScheduling *BS) { if (!BS->ScheduleStart) return; @@ -20570,15 +21408,45 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { if (!Bundle->hasValidDependencies()) BS->calculateDependencies(*Bundle, /*InsertInReadyList=*/false, this); } + SmallVector SDs = BS->getScheduleCopyableData(I); + for (ScheduleCopyableData *SD : reverse(SDs)) { + ScheduleBundle &Bundle = SD->getBundle(); + Bundle.setSchedulingPriority(Idx++); + if (!Bundle.hasValidDependencies()) + BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this); + } continue; } + SmallVector CopyableData = + BS->getScheduleCopyableDataUsers(I); if (ScheduleData *SD = BS->getScheduleData(I)) { [[maybe_unused]] ArrayRef SDTEs = getTreeEntries(I); assert((isVectorLikeInstWithConstOps(SD->getInst()) || SDTEs.empty() || - SDTEs.front()->doesNotNeedToSchedule()) && + SDTEs.front()->doesNotNeedToSchedule() || + doesNotNeedToBeScheduled(I)) && "scheduler and vectorizer bundle mismatch"); SD->setSchedulingPriority(Idx++); - continue; + if (!SD->hasValidDependencies() && + (!CopyableData.empty() || + any_of(R.ValueToGatherNodes.lookup(I), [&](const TreeEntry *TE) { + assert(TE->isGather() && "expected gather node"); + return TE->hasState() && TE->hasCopyableElements() && + TE->isCopyableElement(I); + }))) { + // Need to calculate deps for these nodes to correctly handle copyable + // dependencies, even if they were cancelled. + // If the copyables bundle was cancelled, the deps are cleared and need + // to be recalculated.
+ ScheduleBundle Bundle; + Bundle.add(SD); + BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this); + } + } + for (ScheduleCopyableData *SD : reverse(CopyableData)) { + ScheduleBundle &Bundle = SD->getBundle(); + Bundle.setSchedulingPriority(Idx++); + if (!Bundle.hasValidDependencies()) + BS->calculateDependencies(Bundle, /*InsertInReadyList=*/false, this); } } BS->initialFillReadyList(ReadyInsts); @@ -20594,9 +21462,12 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { // Move the scheduled instruction(s) to their dedicated places, if not // there yet. if (auto *Bundle = dyn_cast(Picked)) { - for (const ScheduleData *BundleMember : Bundle->getBundle()) { + for (const ScheduleEntity *BundleMember : Bundle->getBundle()) { Instruction *PickedInst = BundleMember->getInst(); - if (!Scheduled.insert(PickedInst).second) + // If a copyable must be scheduled as part of something else, skip it. + bool IsCopyable = Bundle->getTreeEntry()->isCopyableElement(PickedInst); + if ((IsCopyable && BS->getScheduleData(PickedInst)) || + (!IsCopyable && !Scheduled.insert(PickedInst).second)) continue; if (PickedInst->getNextNode() != LastScheduledInst) PickedInst->moveAfter(LastScheduledInst->getPrevNode()); @@ -20611,7 +21482,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { PickedInst->moveAfter(LastScheduledInst->getPrevNode()); LastScheduledInst = PickedInst; } - BS->schedule(Picked, ReadyInsts); + auto Invalid = InstructionsState::invalid(); + BS->schedule(R, Invalid, EdgeInfo(), Picked, ReadyInsts); } // Check that we didn't break any of our invariants. diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll index 47153d91956d5..2b591a2165534 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-reorder-reshuffle.ll @@ -191,12 +191,12 @@ define i32 @reorder_indices_1(float %0) { ; NON-POW2-NEXT: entry: ; NON-POW2-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 ; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4 -; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP1]] +; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]] ; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 ; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <3 x float> [[TMP4]], <3 x float> poison, <3 x i32> zeroinitializer ; NON-POW2-NEXT: [[TMP6:%.*]] = fmul <3 x float> [[TMP3]], [[TMP5]] -; NON-POW2-NEXT: [[TMP10:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <3 x i32> -; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP10]]) +; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP6]]) ; NON-POW2-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> [[TMP7]], <3 x float> zeroinitializer) ; NON-POW2-NEXT: [[TMP9:%.*]] = fmul <3 x float> [[TMP8]], zeroinitializer ; NON-POW2-NEXT: store <3 x float> [[TMP9]], ptr [[NOR1]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll index 7408ba10cc772..c791a07993440 ---
a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-schedule-for-subvector.ll @@ -4,8 +4,7 @@ define void @test() { ; CHECK-LABEL: define void @test() { ; CHECK-NEXT: [[BB:.*:]] -; CHECK-NEXT: [[ICMP:%.*]] = icmp samesign ult i32 0, 0 -; CHECK-NEXT: [[SELECT:%.*]] = select i1 [[ICMP]], i32 0, i32 0 +; CHECK-NEXT: [[SELECT:%.*]] = select i1 false, i32 0, i32 0 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i32 [[SELECT]] to i64 ; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr ptr addrspace(1), ptr addrspace(1) null, i64 [[ZEXT]] ; CHECK-NEXT: store ptr addrspace(1) null, ptr addrspace(1) [[GETELEMENTPTR]], align 8 @@ -13,8 +12,6 @@ define void @test() { ; CHECK-NEXT: [[CALL:%.*]] = call i32 null(<2 x double> zeroinitializer) ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> , i32 [[CALL]], i32 3 ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[TMP2]], zeroinitializer -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i1> [[TMP3]], <4 x i1> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i1> [[TMP4]], <8 x i1> , <8 x i32> ; CHECK-NEXT: ret void ; bb: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll b/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll index c8748f316f024..2d4077f82621a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cast-operand-extracted.ll @@ -19,14 +19,14 @@ define void @test(ptr %0, i32 %add651) { ; CHECK-NEXT: [[ARRAYIDX660:%.*]] = getelementptr i8, ptr [[TMP4]], i64 7800 ; CHECK-NEXT: [[ARRAYIDX689:%.*]] = getelementptr i8, ptr [[TMP4]], i64 7816 ; CHECK-NEXT: [[TMP6:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1) -; CHECK-NEXT: [[TMP8:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> , i32 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP8]], [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i32> [[TMP6]], [[TMP7]] ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[ADD651]], i32 0 ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP13]], <4 x i32> [[TMP19]], <4 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = lshr <4 x i32> [[TMP14]], splat (i32 1) +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i32> , i32 [[TMP5]], i32 3 +; CHECK-NEXT: [[TMP21:%.*]] = add <4 x i32> [[TMP14]], [[TMP20]] +; CHECK-NEXT: [[TMP15:%.*]] = lshr <4 x i32> [[TMP21]], splat (i32 1) ; CHECK-NEXT: [[SHR685:%.*]] = lshr i32 [[TMP2]], 1 ; CHECK-NEXT: [[TMP16:%.*]] = trunc <4 x i32> [[TMP15]] to <4 x i16> ; CHECK-NEXT: [[CONV686:%.*]] = trunc i32 [[SHR685]] to i16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll index ea637bbec8955..ca14826d57826 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr35497.ll @@ -87,29 +87,29 @@ define void @pr35497(ptr %p, i64 %c) { ; AVX-LABEL: @pr35497( ; AVX-NEXT: entry: ; AVX-NEXT: [[TMP0:%.*]] = load i64, ptr [[P:%.*]], align 1 -; AVX-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[C:%.*]], i32 0 -; AVX-NEXT: [[TMP11:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <2 x i32> zeroinitializer -; AVX-NEXT: [[TMP13:%.*]] = lshr <2 x i64> [[TMP11]], splat (i64 6) +; 
AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[C:%.*]], i32 0 +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <2 x i32> zeroinitializer +; AVX-NEXT: [[TMP3:%.*]] = lshr <2 x i64> [[TMP2]], splat (i64 6) ; AVX-NEXT: [[ARRAYIDX2_2:%.*]] = getelementptr inbounds [0 x i64], ptr [[P]], i64 0, i64 4 ; AVX-NEXT: [[ARRAYIDX2_5:%.*]] = getelementptr inbounds [0 x i64], ptr [[P]], i64 0, i64 1 -; AVX-NEXT: [[TMP1:%.*]] = insertelement <2 x i64> [[TMP11]], i64 [[TMP0]], i32 1 -; AVX-NEXT: [[TMP2:%.*]] = shl <2 x i64> [[TMP1]], splat (i64 2) -; AVX-NEXT: [[TMP3:%.*]] = and <2 x i64> [[TMP2]], splat (i64 20) -; AVX-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP1]], <2 x i32> -; AVX-NEXT: [[TMP16:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> [[TMP14]], <2 x i32> -; AVX-NEXT: [[TMP6:%.*]] = add <2 x i64> [[TMP14]], [[TMP16]] -; AVX-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP6]], i32 1 -; AVX-NEXT: store i64 [[TMP17]], ptr [[P]], align 1 -; AVX-NEXT: [[TMP4:%.*]] = add nuw nsw <2 x i64> [[TMP3]], [[TMP13]] -; AVX-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP6]], i32 0 +; AVX-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP0]], i32 1 +; AVX-NEXT: [[TMP5:%.*]] = shl <2 x i64> [[TMP4]], splat (i64 2) +; AVX-NEXT: [[TMP6:%.*]] = and <2 x i64> [[TMP5]], splat (i64 20) +; AVX-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP6]], <2 x i64> [[TMP4]], <2 x i32> +; AVX-NEXT: [[TMP8:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> [[TMP7]], <2 x i32> +; AVX-NEXT: [[TMP9:%.*]] = add <2 x i64> [[TMP7]], [[TMP8]] +; AVX-NEXT: [[TMP10:%.*]] = extractelement <2 x i64> [[TMP9]], i32 1 +; AVX-NEXT: store i64 [[TMP10]], ptr [[P]], align 1 +; AVX-NEXT: [[TMP11:%.*]] = add nuw nsw <2 x i64> [[TMP6]], [[TMP3]] +; AVX-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP9]], i32 0 ; AVX-NEXT: store i64 [[TMP12]], ptr [[ARRAYIDX2_5]], align 1 -; AVX-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]], splat (i64 2) -; AVX-NEXT: [[TMP8:%.*]] = and <2 x i64> [[TMP7]], splat (i64 20) -; AVX-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP4]], i32 0 +; AVX-NEXT: [[TMP13:%.*]] = shl <2 x i64> [[TMP9]], splat (i64 2) +; AVX-NEXT: [[TMP14:%.*]] = and <2 x i64> [[TMP13]], splat (i64 20) +; AVX-NEXT: [[TMP15:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0 ; AVX-NEXT: store i64 [[TMP15]], ptr [[P]], align 1 -; AVX-NEXT: [[TMP9:%.*]] = lshr <2 x i64> [[TMP4]], splat (i64 6) -; AVX-NEXT: [[TMP10:%.*]] = add nuw nsw <2 x i64> [[TMP8]], [[TMP9]] -; AVX-NEXT: store <2 x i64> [[TMP10]], ptr [[ARRAYIDX2_2]], align 1 +; AVX-NEXT: [[TMP16:%.*]] = lshr <2 x i64> [[TMP11]], splat (i64 6) +; AVX-NEXT: [[TMP17:%.*]] = add nuw nsw <2 x i64> [[TMP14]], [[TMP16]] +; AVX-NEXT: store <2 x i64> [[TMP17]], ptr [[ARRAYIDX2_2]], align 1 ; AVX-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll index a17ccb4b46ef9..a56c6b76ba39f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shuffle-mask-emission.ll @@ -5,9 +5,11 @@ define i1 @test() { ; CHECK-LABEL: define i1 @test() { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[H_PROMOTED118_I_FR:%.*]] = freeze i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> , i32 [[H_PROMOTED118_I_FR]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> zeroinitializer, [[TMP3]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> 
[[TMP4]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> , i32 [[H_PROMOTED118_I_FR]], i32 2 ; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> zeroinitializer, [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] ; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i32> [[TMP5]], ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq <4 x i32> [[TMP6]], diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll index 36151df96bfca..4d1f6a1aa074b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-reorder-reshuffle.ll @@ -190,12 +190,12 @@ define i32 @reorder_indices_1(float %0) { ; NON-POW2-NEXT: entry: ; NON-POW2-NEXT: [[NOR1:%.*]] = alloca [0 x [3 x float]], i32 0, align 4 ; NON-POW2-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[NOR1]], align 4 -; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP1]] +; NON-POW2-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <3 x i32> +; NON-POW2-NEXT: [[TMP3:%.*]] = fneg <3 x float> [[TMP2]] ; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> poison, float [[TMP0]], i32 0 ; NON-POW2-NEXT: [[TMP5:%.*]] = shufflevector <3 x float> [[TMP4]], <3 x float> poison, <3 x i32> zeroinitializer ; NON-POW2-NEXT: [[TMP6:%.*]] = fmul <3 x float> [[TMP3]], [[TMP5]] -; NON-POW2-NEXT: [[TMP10:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <3 x i32> -; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP10]]) +; NON-POW2-NEXT: [[TMP7:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP1]], <3 x float> zeroinitializer, <3 x float> [[TMP6]]) ; NON-POW2-NEXT: [[TMP8:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> [[TMP7]], <3 x float> zeroinitializer) ; NON-POW2-NEXT: [[TMP9:%.*]] = fmul <3 x float> [[TMP8]], zeroinitializer ; NON-POW2-NEXT: store <3 x float> [[TMP9]], ptr [[NOR1]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll b/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll index fff988a0a746e..766e1fb50330b 100644 --- a/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll +++ b/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll @@ -6,20 +6,21 @@ define i32 @test(i8 %0) { ; CHECK-SAME: i8 [[TMP0:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[CMP13_NOT_5:%.*]] = icmp eq i64 0, 0 -; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 7), align 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i8> , i8 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i8> zeroinitializer, [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = load volatile i8, ptr null, align 8 -; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i8>, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 9), align 1 +; CHECK-NEXT: [[TEST_STRUCTCOPY_14_S14_CM_COERCE_SROA_2_0_COPYLOAD:%.*]] = load i48, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8 +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr 
addrspace(21) null, align 2 +; CHECK-NEXT: [[TMP13:%.*]] = load volatile i8, ptr null, align 2 +; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i8>, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 7), align 1 +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <2 x i8> , <2 x i8> [[TMP5]], <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <2 x i8> zeroinitializer, [[TMP32]] ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i8> [[TMP5]], <2 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i8> , <8 x i8> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <8 x i8> , <8 x i8> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i8> [[TMP33]], i8 [[TMP2]], i32 5 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i8> zeroinitializer, [[TMP7]] -; CHECK-NEXT: [[TEST_STRUCTCOPY_14_S14_CM_COERCE_SROA_2_0_COPYLOAD:%.*]] = load i48, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8 ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i48> , i48 [[TEST_STRUCTCOPY_14_S14_CM_COERCE_SROA_2_0_COPYLOAD]], i32 0 ; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i48> [[TMP9]] to <4 x i8> ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i8> zeroinitializer, [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr addrspace(21) null, align 2 -; CHECK-NEXT: [[TMP13:%.*]] = load volatile i8, ptr null, align 2 ; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i8>, ptr addrspace(21) getelementptr inbounds (i8, ptr addrspace(21) null, i64 8), align 8 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x i8> , i8 [[TMP12]], i32 1 ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i8> [[TMP14]], <2 x i8> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/revec.ll b/llvm/test/Transforms/SLPVectorizer/revec.ll index ac8b10a0087d0..df42cba7c8d45 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec.ll @@ -306,13 +306,11 @@ define void @test11(<2 x i64> %0, i64 %1, <2 x i64> %2) { ; CHECK-LABEL: @test11( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> [[TMP0:%.*]], i64 [[TMP1:%.*]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i64> , [[TMP2:%.*]] -; CHECK-NEXT: [[TMP5:%.*]] = trunc <2 x i64> [[TMP4]] to <2 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i16> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i16> [[TMP6]], <4 x i16> [[TMP10]], <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i16> [[TMP8]] to <4 x i8> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[TMP2:%.*]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP5]], <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = add <4 x i64> , [[TMP6]] +; CHECK-NEXT: [[TMP9:%.*]] = trunc <4 x i64> [[TMP7]] to <4 x i8> ; CHECK-NEXT: [[TMP11:%.*]] = urem <4 x i8> [[TMP9]], zeroinitializer ; CHECK-NEXT: [[TMP12:%.*]] = icmp ne <4 x i8> [[TMP11]], zeroinitializer ; CHECK-NEXT: ret void
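Reviewer note on the calculateDependencies hunks above: they repeat one bookkeeping pattern in several places. Whenever a dependency edge is discovered, the dependent node increments its total dependency count, additionally increments its unscheduled-dependency count if the other end is not scheduled yet, and the other end is re-pushed onto the worklist only when its own dependencies are stale or when it may just have become ready while the ready list is being filled (previously it was pushed unconditionally). A minimal standalone sketch of that discipline, using a hypothetical Node type rather than the real ScheduleData/ScheduleCopyableData/ScheduleEntity classes:

#include <vector>

// Hypothetical stand-in for a scheduling node.
struct Node {
  std::vector<Node *> Users; // IR users; each contributes one dependency
  int Dependencies = -1;     // -1 means "not calculated yet"
  int UnscheduledDeps = 0;   // dependencies whose node is not yet scheduled
  bool Scheduled = false;

  bool hasValidDependencies() const { return Dependencies >= 0; }
  bool isReady() const {
    return hasValidDependencies() && UnscheduledDeps == 0;
  }
};

// Mirrors the ProcessNode pattern: (re)derive the counters of N from its
// users and decide which of them must be (re)visited.
void processNode(Node &N, std::vector<Node *> &WorkList,
                 bool InsertInReadyList) {
  if (N.hasValidDependencies())
    return; // counters are already valid, nothing to recompute
  N.Dependencies = 0;
  N.UnscheduledDeps = 0;
  for (Node *Use : N.Users) {
    ++N.Dependencies;      // incDependencies()
    if (!Use->Scheduled)
      ++N.UnscheduledDeps; // incrementUnscheduledDeps(1)
    // Re-push only if the user's deps are stale, or if it may have just
    // become ready while the ready list is being filled.
    if (!Use->hasValidDependencies() ||
        (InsertInReadyList && Use->isReady()))
      WorkList.push_back(Use);
  }
}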
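Both the tryScheduleBundle cancellation path and the ScheduleCopyableData branch of ProcessNode walk every occurrence of an instruction in the user entry's operand list and translate the operand position into a lane of the user's scalars via ReorderIndices. A sketch of that lane translation with simplified stand-in types; note the real code additionally guards the translation on the kind of the scalar (the isa<> argument is elided in the extracted diff above):

#include <algorithm>
#include <cassert>
#include <iterator>
#include <vector>

// Simplified stand-ins: scalars and operands are plain ids here.
struct UserEntry {
  std::vector<int> Operand;        // operand list for one use edge
  std::vector<int> Scalars;        // the user entry's scalars
  std::vector<int> ReorderIndices; // empty when no reordering happened
};

// Visit the user scalar for every occurrence of value Id in the operand
// list, translating the operand position through ReorderIndices.
template <typename Fn>
void forEachUserLane(const UserEntry &UE, int Id, Fn Visit) {
  auto It = std::find(UE.Operand.begin(), UE.Operand.end(), Id);
  assert(It != UE.Operand.end() && "Lane not set");
  do {
    int Lane = static_cast<int>(std::distance(UE.Operand.begin(), It));
    if (!UE.ReorderIndices.empty())
      Lane = UE.ReorderIndices[Lane]; // map to the reordered lane
    assert(Lane < static_cast<int>(UE.Scalars.size()) &&
           "Couldn't find extract lane");
    Visit(UE.Scalars[Lane]);
    // The same value may occur in several lanes; keep searching.
    It = std::find(std::next(It), UE.Operand.end(), Id);
  } while (It != UE.Operand.end());
}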
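Finally, the cancellation path works because per-instruction copyable data is kept as a stack: a failed scheduling attempt is undone by popping the most recent entry, which restores whatever mapping was there before. A compact sketch of this stack-based rollback idea (hypothetical names, not the actual ScheduleCopyableDataMap* containers):

#include <cassert>
#include <map>
#include <vector>

struct CopyableData {}; // stand-in for ScheduleCopyableData

// Hypothetical container: one stack of scheduling attempts per instruction.
std::map<const void *, std::vector<CopyableData *>> DataByInst;

void recordAttempt(const void *Inst, CopyableData *CD) {
  DataByInst[Inst].push_back(CD); // the newest attempt sits on top
}

void cancelLatestAttempt(const void *Inst) {
  auto It = DataByInst.find(Inst);
  assert(It != DataByInst.end() && "no data for copyable element");
  It->second.pop_back(); // drop the failed attempt; older mapping reappears
  if (It->second.empty())
    DataByInst.erase(It); // keep the map tidy when nothing remains
}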