diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 3045eeb3eb48e..3a1f92b5ba2c6 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1900,8 +1900,10 @@ getNumberOfParts(const TargetTransformInfo &TTI, VectorType *VecTy, if (NumParts == 0 || NumParts >= Limit) return 1; unsigned Sz = getNumElements(VecTy); - if (NumParts >= Sz || Sz % NumParts != 0 || - !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), Sz / NumParts)) + unsigned PWSz = + getFullVectorNumberOfElements(TTI, VecTy->getElementType(), Sz); + if (NumParts >= Sz || PWSz % NumParts != 0 || + !hasFullVectorsOrPowerOf2(TTI, VecTy->getElementType(), PWSz / NumParts)) return 1; return NumParts; } @@ -2039,6 +2041,9 @@ class BoUpSLP { VectorizableTree.front()->getVectorFactor()); } + /// Returns true if the tree is a reduction tree. + bool isReductionTree() const { return UserIgnoreList; } + /// Builds external uses of the vectorized scalars, i.e. the list of /// vectorized scalars to be extracted, their lanes and their scalar users. \p /// ExternallyUsedValues contains additional list of external uses to handle @@ -2230,6 +2235,21 @@ class BoUpSLP { unsigned *BestVF = nullptr, bool TryRecursiveCheck = true) const; + /// Checks if the given array of vectorized values has the same node in the + /// tree. 
+ bool hasSameNode(const InstructionsState &S, ArrayRef VL) const { + if (S) { + if (any_of(getTreeEntries(S.getMainOp()), + [&](const TreeEntry *TE) { return TE->isSame(VL); })) + return true; + return any_of(ValueToGatherNodes.lookup(S.getMainOp()), + [&](const TreeEntry *TE) { return TE->isSame(VL); }); + } + return any_of(VectorizableTree, [&](const std::unique_ptr &TE) { + return TE->isGather() && TE->isSame(VL); + }); + } + /// Registers non-vectorizable sequence of loads template void registerNonVectorizableLoads(ArrayRef VL) { ListOfKnonwnNonVectorizableLoads.insert(hash_value(VL)); @@ -3274,11 +3294,7 @@ class BoUpSLP { })) return false; } - // TODO: Check if we can remove a check for non-power-2 number of - // scalars after full support of non-power-2 vectorization. - return UniqueValues.size() != 2 && - hasFullVectorsOrPowerOf2(*R.TTI, Op0.front().V->getType(), - UniqueValues.size()); + return UniqueValues.size() != 2; }; // If the initial strategy fails for any of the operand indexes, then we @@ -3713,8 +3729,8 @@ class BoUpSLP { std::optional isGatherShuffledSingleRegisterEntry( const TreeEntry *TE, ArrayRef VL, MutableArrayRef Mask, - SmallVectorImpl &Entries, unsigned Part, - bool ForOrder); + SmallVectorImpl &Entries, unsigned Part, bool ForOrder, + unsigned SliceSize); /// Checks if the gathered \p VL can be represented as multi-register /// shuffle(s) of previous tree entries. @@ -4109,17 +4125,6 @@ class BoUpSLP { return IsNonPowerOf2; } - /// Return true if this is a node, which tries to vectorize number of - /// elements, forming whole vectors. 
- bool - hasNonWholeRegisterOrNonPowerOf2Vec(const TargetTransformInfo &TTI) const { - bool IsNonPowerOf2 = !hasFullVectorsOrPowerOf2( - TTI, getValueType(Scalars.front()), Scalars.size()); - assert((!IsNonPowerOf2 || ReuseShuffleIndices.empty()) && - "Reshuffling not supported with non-power-of-2 vectors yet."); - return IsNonPowerOf2; - } - Value *getOrdered(unsigned Idx) const { assert(isGather() && "Must be used only for buildvectors/gathers."); if (ReorderIndices.empty()) @@ -4276,12 +4281,6 @@ class BoUpSLP { if (UserTreeIdx.UserTE) OperandsToTreeEntry.try_emplace( std::make_pair(UserTreeIdx.UserTE, UserTreeIdx.EdgeIdx), Last); - // FIXME: Remove once support for ReuseShuffleIndices has been implemented - // for non-power-of-two vectors. - assert( - (hasFullVectorsOrPowerOf2(*TTI, getValueType(VL.front()), VL.size()) || - ReuseShuffleIndices.empty()) && - "Reshuffling scalars not yet supported for nodes with padding"); Last->ReuseShuffleIndices.append(ReuseShuffleIndices.begin(), ReuseShuffleIndices.end()); if (ReorderIndices.empty()) { @@ -4440,21 +4439,16 @@ class BoUpSLP { class ScalarsVectorizationLegality { InstructionsState S; bool IsLegal; - bool TryToFindDuplicates; bool TrySplitVectorize; public: ScalarsVectorizationLegality(InstructionsState S, bool IsLegal, - bool TryToFindDuplicates = true, bool TrySplitVectorize = false) - : S(S), IsLegal(IsLegal), TryToFindDuplicates(TryToFindDuplicates), - TrySplitVectorize(TrySplitVectorize) { - assert((!IsLegal || (S.valid() && TryToFindDuplicates)) && - "Inconsistent state"); + : S(S), IsLegal(IsLegal), TrySplitVectorize(TrySplitVectorize) { + assert((!IsLegal || S.valid()) && "Inconsistent state"); } const InstructionsState &getInstructionsState() const { return S; }; bool isLegal() const { return IsLegal; } - bool tryToFindDuplicates() const { return TryToFindDuplicates; } bool trySplitVectorize() const { return TrySplitVectorize; } }; @@ -6165,7 +6159,7 @@ BoUpSLP::findReusedOrderedScalars(const 
BoUpSLP::TreeEntry &TE, auto TransformMaskToOrder = [&](MutableArrayRef CurrentOrder, ArrayRef Mask, int PartSz, int NumParts, function_ref GetVF) { - for (int I : seq(0, NumParts)) { + for (int I : seq(NumParts)) { if (ShuffledSubMasks.test(I)) continue; const int VF = GetVF(I); @@ -6216,6 +6210,8 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE, SecondVecFound = true; break; } + if (static_cast(I * PartSz + Idx) >= CurrentOrder.size()) + break; if (CurrentOrder[I * PartSz + Idx] > static_cast(I * PartSz + K) && CurrentOrder[I * PartSz + Idx] != @@ -6234,12 +6230,14 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE, if (!ExtractShuffles.empty()) TransformMaskToOrder( CurrentOrder, ExtractMask, PartSz, NumParts, [&](unsigned I) { - if (!ExtractShuffles[I]) + if (I >= ExtractShuffles.size() || !ExtractShuffles[I]) return 0U; unsigned VF = 0; unsigned Sz = getNumElems(TE.getVectorFactor(), PartSz, I); for (unsigned Idx : seq(Sz)) { int K = I * PartSz + Idx; + if (static_cast(K) >= ExtractMask.size()) + break; if (ExtractMask[K] == PoisonMaskElem) continue; if (!TE.ReuseShuffleIndices.empty()) @@ -6267,7 +6265,7 @@ BoUpSLP::findReusedOrderedScalars(const BoUpSLP::TreeEntry &TE, } if (!Entries.empty()) TransformMaskToOrder(CurrentOrder, Mask, PartSz, NumParts, [&](unsigned I) { - if (!GatherShuffles[I]) + if (I >= GatherShuffles.size() || !GatherShuffles[I]) return 0U; return std::max(Entries[I].front()->getVectorFactor(), Entries[I].back()->getVectorFactor()); @@ -6979,12 +6977,6 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, if (!TryRecursiveCheck || VL.size() < ListLimit) return MaskedGatherCost - GatherCost >= -SLPCostThreshold; - // FIXME: The following code has not been updated for non-power-of-2 - // vectors (and not whole registers). The splitting logic here does not - // cover the original vector if the vector factor is not a power of two. 
- if (!hasFullVectorsOrPowerOf2(TTI, ScalarTy, VL.size())) - return false; - unsigned Sz = DL->getTypeSizeInBits(ScalarTy); unsigned MinVF = getMinVF(2 * Sz); DemandedElts.clearAllBits(); @@ -6995,8 +6987,8 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, VF >= MinVF; VF = getFloorFullVectorNumberOfElements(TTI, ScalarTy, VF - 1)) { SmallVector States; - for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) { - ArrayRef Slice = VL.slice(Cnt, VF); + for (unsigned Cnt = 0, End = VL.size(); Cnt < End; Cnt += VF) { + ArrayRef Slice = VL.slice(Cnt, std::min(VF, End - Cnt)); SmallVector Order; SmallVector PointerOps; LoadsState LS = @@ -7008,7 +7000,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, DemandedElts.setAllBits(); break; } - DemandedElts.setBits(Cnt, Cnt + VF); + DemandedElts.setBits(Cnt, Cnt + Slice.size()); continue; } // If need the reorder - consider as high-cost masked gather for now. @@ -7034,13 +7026,14 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, VecLdCost += TTI.getInstructionCost(cast(VL[Idx]), CostKind); } - auto *SubVecTy = getWidenedType(ScalarTy, VF); for (auto [I, LS] : enumerate(States)) { + const unsigned SliceVF = std::min(VF, VL.size() - I * VF); + auto *SubVecTy = getWidenedType(ScalarTy, SliceVF); auto *LI0 = cast(VL[I * VF]); InstructionCost VectorGEPCost = (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers) ? 
0 - : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF), + : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, SliceVF), LI0->getPointerOperand(), Instruction::GetElementPtr, CostKind, ScalarTy, SubVecTy) @@ -7054,12 +7047,12 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, getUnderlyingObject(PointerOps.front()); })) VectorGEPCost += getScalarizationOverhead( - TTI, ScalarTy, SubVecTy, APInt::getAllOnes(VF), + TTI, ScalarTy, SubVecTy, APInt::getAllOnes(SliceVF), /*Insert=*/true, /*Extract=*/false, CostKind); else VectorGEPCost += getScalarizationOverhead( - TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(VF, 0), + TTI, ScalarTy, SubVecTy, APInt::getOneBitSet(SliceVF, 0), /*Insert=*/true, /*Extract=*/false, CostKind) + ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, {}, CostKind); @@ -7099,7 +7092,7 @@ BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, continue; } SmallVector ShuffleMask(VL.size()); - for (int Idx : seq(0, VL.size())) + for (int Idx : seq(VL.size())) ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx; if (I > 0) VecLdCost += @@ -7338,10 +7331,6 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, // No need to reorder if need to shuffle reuses, still need to shuffle the // node. if (!TE.ReuseShuffleIndices.empty()) { - // FIXME: Support ReuseShuffleIndices for non-power-of-two vectors. - assert(!TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI) && - "Reshuffling scalars not yet supported for nodes with padding"); - if (isSplat(TE.Scalars)) return std::nullopt; // Check if reuse shuffle indices can be improved by reordering. @@ -7680,12 +7669,9 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom, Res == LoadsState::CompressVectorize) return std::move(CurrentOrder); } - // FIXME: Remove the non-power-of-two check once findReusedOrderedScalars - // has been auditted for correctness with non-power-of-two vectors. 
- if (!VectorizeNonPowerOf2 || !TE.hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) - if (std::optional CurrentOrder = - findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) - return CurrentOrder; + if (std::optional CurrentOrder = + findReusedOrderedScalars(TE, TopToBottom, IgnoreReorder)) + return CurrentOrder; } return std::nullopt; } @@ -7969,7 +7955,7 @@ void BoUpSLP::reorderTopToBottom() { // Reorder the graph nodes according to their vectorization factor. for (unsigned VF = VectorizableTree.front()->getVectorFactor(); - !VFToOrderedEntries.empty() && VF > 1; VF -= 2 - (VF & 1U)) { + !VFToOrderedEntries.empty() && VF > 1; --VF) { auto It = VFToOrderedEntries.find(VF); if (It == VFToOrderedEntries.end()) continue; @@ -9161,17 +9147,15 @@ void BoUpSLP::tryToVectorizeGatheredLoads( AllowToVectorize = CheckIfAllowed(Slice); } else { AllowToVectorize = - (NumElts >= 3 || - any_of(ValueToGatherNodes.at(Slice.front()), - [=](const TreeEntry *TE) { - return TE->Scalars.size() == 2 && - ((TE->Scalars.front() == Slice.front() && - TE->Scalars.back() == Slice.back()) || - (TE->Scalars.front() == Slice.back() && - TE->Scalars.back() == Slice.front())); - })) && - hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), - Slice.size()); + NumElts >= 3 || + any_of(ValueToGatherNodes.at(Slice.front()), + [=](const TreeEntry *TE) { + return TE->Scalars.size() == 2 && + ((TE->Scalars.front() == Slice.front() && + TE->Scalars.back() == Slice.back()) || + (TE->Scalars.front() == Slice.back() && + TE->Scalars.back() == Slice.front())); + }); } if (AllowToVectorize) { SmallVector PointerOps; @@ -9825,10 +9809,6 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( [[fallthrough]]; case Instruction::ExtractValue: { bool Reuse = canReuseExtract(VL, CurrentOrder); - // FIXME: Vectorizing is not supported yet for non-power-of-2 ops (and - // non-full registers). 
- if (!hasFullVectorsOrPowerOf2(*TTI, VL0->getType(), VL.size())) - return TreeEntry::NeedToGather; if (Reuse || !CurrentOrder.empty()) return TreeEntry::Vectorize; LLVM_DEBUG(dbgs() << "SLP: Gather extract sequence.\n"); @@ -10336,7 +10316,7 @@ static bool tryToFindDuplicates(SmallVectorImpl &VL, const TargetLibraryInfo &TLI, const InstructionsState &S, const BoUpSLP::EdgeInfo &UserTreeIdx, - bool TryPad = false) { + const BoUpSLP &R, bool BuildGatherOnly = true) { // Check that every instruction appears once in this bundle. SmallVector UniqueValues; SmallDenseMap UniquePositions(VL.size()); @@ -10355,69 +10335,152 @@ static bool tryToFindDuplicates(SmallVectorImpl &VL, UniqueValues.emplace_back(V); } + bool AreAllValuesNonConst = UniquePositions.size() == UniqueValues.size(); + + // Check if we need to schedule the scalars. If no, can keep original scalars + // and avoid extra shuffles. + bool RequireScheduling = S && S.getOpcode() != Instruction::PHI && + !isVectorLikeInstWithConstOps(S.getMainOp()) && + (S.areInstructionsWithCopyableElements() || + !doesNotNeedToSchedule(UniqueValues)); + // Drop tail poisons, if the values can be vectorized. + if (RequireScheduling) { + const auto EndIt = + find_if_not(make_range(UniqueValues.rbegin(), UniqueValues.rend()), + IsaPred); + assert(EndIt != UniqueValues.rend() && "Expected at least one non-poison."); + UniqueValues.erase(EndIt.base(), UniqueValues.end()); + } // Easy case: VL has unique values and a "natural" size size_t NumUniqueScalarValues = UniqueValues.size(); - bool IsFullVectors = hasFullVectorsOrPowerOf2( - TTI, getValueType(UniqueValues.front()), NumUniqueScalarValues); - if (NumUniqueScalarValues == VL.size() && - (VectorizeNonPowerOf2 || IsFullVectors)) { + if (NumUniqueScalarValues == VL.size()) { ReuseShuffleIndices.clear(); return true; } - // FIXME: Reshuffing scalars is not supported yet for non-power-of-2 ops. 
- if ((UserTreeIdx.UserTE && - UserTreeIdx.UserTE->hasNonWholeRegisterOrNonPowerOf2Vec(TTI)) || - !hasFullVectorsOrPowerOf2(TTI, getValueType(VL.front()), VL.size())) { - LLVM_DEBUG(dbgs() << "SLP: Reshuffling scalars not yet supported " - "for nodes with padding.\n"); - ReuseShuffleIndices.clear(); - return false; - } - - LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); - if (NumUniqueScalarValues <= 1 || !IsFullVectors || - (UniquePositions.size() == 1 && all_of(UniqueValues, [](Value *V) { - return isa(V) || !isConstant(V); - }))) { - if (TryPad && UniquePositions.size() > 1 && NumUniqueScalarValues > 1 && - S.getMainOp()->isSafeToRemove() && - (S.areInstructionsWithCopyableElements() || - all_of(UniqueValues, IsaPred))) { - // Find the number of elements, which forms full vectors. - unsigned PWSz = getFullVectorNumberOfElements( - TTI, UniqueValues.front()->getType(), UniqueValues.size()); - PWSz = std::min(PWSz, VL.size()); - if (PWSz == VL.size()) { - // We ended up with the same size after removing duplicates and - // upgrading the resulting vector size to a "nice size". Just keep - // the initial VL then. - ReuseShuffleIndices.clear(); - } else { - // Pad unique values with poison to grow the vector to a "nice" size - SmallVector PaddedUniqueValues(UniqueValues.begin(), - UniqueValues.end()); - PaddedUniqueValues.append( - PWSz - UniqueValues.size(), - PoisonValue::get(UniqueValues.front()->getType())); - // Check that extended with poisons/copyable operations are still valid - // for vectorization (div/rem are not allowed). - if (!S.areInstructionsWithCopyableElements() && - !getSameOpcode(PaddedUniqueValues, TLI).valid()) { - LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - ReuseShuffleIndices.clear(); - return false; - } - VL = std::move(PaddedUniqueValues); + // Checks if unique inserts + shuffle is more profitable than just inserts or + // vectorized values. 
+ auto EstimatePackPlusShuffleVsInserts = [&]() { + // Single instruction/argument insert - no shuffle. + if (UniquePositions.size() == 1 && + (NumUniqueScalarValues == 1 || + all_of(UniqueValues, IsaPred))) + return std::make_pair(false, false); + // Check if the given list of loads can be effectively vectorized. + auto CheckLoads = [&](ArrayRef VL, bool IncludeGather) { + assert(S && S.getOpcode() == Instruction::Load && "Expected load."); + BoUpSLP::OrdersType Order; + SmallVector PointerOps; + // Modified loads are gathered - use the original loads, result is the + // same, but cheaper, no shuffle. + BoUpSLP::LoadsState Res = + R.canVectorizeLoads(VL, S.getMainOp(), Order, PointerOps); + return (IncludeGather && Res == BoUpSLP::LoadsState::Gather) || + Res == BoUpSLP::LoadsState::ScatterVectorize; + }; + // If the scalars are the operands of the root node - try to vectorize them + // with shuffles, otherwise we end up with the gather node, which may be + // non-profitable/small-tree for the vectorization. + if (UserTreeIdx.UserTE && UserTreeIdx.UserTE->Idx == 0 && + !BuildGatherOnly) { + if (S && S.getOpcode() == Instruction::Load) { + // Modified loads are gathered - use the original loads, result is the + // same, but cheaper, no shuffle. + return std::make_pair( + true, CheckLoads(UniqueValues, /*IncludeGather=*/true) && + CheckLoads(VL, /*IncludeGather=*/false)); + } + return std::make_pair(true, !RequireScheduling); + } + // Mark unique scalars, to be gathered/buildvectorized. + APInt DemandedElts = APInt::getZero(VL.size()); + for_each(enumerate(ReuseShuffleIndices), [&](const auto &P) { + // Do not include constants. 
+ if (P.value() != PoisonMaskElem && + UniquePositions.contains(UniqueValues[P.value()])) + DemandedElts.setBit(P.index()); + }); + Type *ScalarTy = UniqueValues.front()->getType(); + auto *VecTy = getWidenedType(ScalarTy, VL.size()); + auto *UniquesVecTy = getWidenedType(ScalarTy, NumUniqueScalarValues); + // No need to schedule scalars and only single register used? Use original + // scalars, do not pack. + if (!RequireScheduling) { + const unsigned NumParts = ::getNumberOfParts(TTI, VecTy); + if (VL.size() / NumUniqueScalarValues == 1 && + (NumParts <= 1 || ::getNumberOfParts(TTI, UniquesVecTy) >= NumParts)) + return std::make_pair(true, true); + } + // Check if unique loads more profitable than repeated loads. + if (S && S.getOpcode() == Instruction::Load) { + bool UniquesVectorized = + !CheckLoads(UniqueValues, /*IncludeGather=*/true); + if (UniquesVectorized || CheckLoads(VL, /*IncludeGather=*/false)) { + // Modified loads are gathered - use the original loads, result is the + // same, but cheaper, no shuffle. + return std::make_pair(true, !UniquesVectorized); } - return true; } - LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); - ReuseShuffleIndices.clear(); - return false; + constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + // Insert to poison, if no constants in scalars. + // If attempt to build vector node too, we can skip buildvector cost, + // because it will be the same for both unique and non-unique values. + InstructionCost InsertsCost = + ((!BuildGatherOnly && !RequireScheduling) || R.hasSameNode(S, VL)) + ? InstructionCost(TTI::TCC_Free) + : ::getScalarizationOverhead(TTI, ScalarTy, VecTy, DemandedElts, + /*Insert=*/true, /*Extract=*/false, + CostKind, AreAllValuesNonConst, VL); + APInt UniquesDemandedElts = APInt::getAllOnes(NumUniqueScalarValues); + for_each(seq(NumUniqueScalarValues), [&](unsigned Idx) { + // Do not include constants. 
+ if (isConstant(UniqueValues[Idx])) + UniquesDemandedElts.clearBit(Idx); + }); + InstructionCost UniquesCost = + (!BuildGatherOnly || R.hasSameNode(S, UniqueValues)) + ? InstructionCost(TTI::TCC_Free) + : ::getScalarizationOverhead(TTI, ScalarTy, UniquesVecTy, + UniquesDemandedElts, /*Insert=*/true, + /*Extract=*/false, CostKind, + AreAllValuesNonConst, UniqueValues); + UniquesCost += ::getShuffleCost(TTI, TTI::SK_PermuteSingleSrc, VecTy, + ReuseShuffleIndices, CostKind, /*Index=*/0, + UniquesVecTy); + // If unique values are cheaper, use them. + if (UniquesCost <= InsertsCost) + return std::make_pair(true, false); + // If the cost difference is small, but the reduction cost may give some + // perf gain, prefer unique values, if we can fully vectorize it. + if (UniquesCost - InsertsCost < TTI::TCC_Expensive || + (R.getTreeSize() == 0 && R.isReductionTree() && + UniquesCost - InsertsCost == TTI::TCC_Expensive)) + return std::make_pair(S && (!S.isAltShuffle() || !BuildGatherOnly), + false); + // Otherwise, use the original values if they do not require scheduling and + // the pass still tries to vectorize them. + return std::make_pair(!BuildGatherOnly && !RequireScheduling, + !BuildGatherOnly && !RequireScheduling); + }; + + const auto [DoPack, UseOriginal] = EstimatePackPlusShuffleVsInserts(); + + if (DoPack) { + if (UseOriginal) { + // Prefer original scalars - avoid shuffling. + ReuseShuffleIndices.clear(); + } else { + // Better to use uniques + reshuffle. + LLVM_DEBUG(dbgs() << "SLP: Shuffle for reused scalars.\n"); + VL = std::move(UniqueValues); + } + return true; } - VL = std::move(UniqueValues); - return true; + + // Buildvector/gather of the original scalars. 
+ LLVM_DEBUG(dbgs() << "SLP: Scalar used twice in bundle.\n"); + ReuseShuffleIndices.clear(); + return false; } bool BoUpSLP::canBuildSplitNode(ArrayRef VL, @@ -10479,9 +10542,7 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef VL, SmallPtrSet UOp1(llvm::from_range, Op1); SmallPtrSet UOp2(llvm::from_range, Op2); if (UOp1.size() <= 1 || UOp2.size() <= 1 || - TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask) || - !hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), Op1.size()) || - !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), Op2.size())) + TTI->isLegalAltInstr(VecTy, Opcode0, Opcode1, OpcodeMask)) return false; // Enable split node, only if all nodes are power-of-2/full registers. unsigned Op1Cnt = 0, Op2Cnt = Op1.size(); @@ -10496,6 +10557,12 @@ bool BoUpSLP::canBuildSplitNode(ArrayRef VL, } if (isIdentityOrder(ReorderIndices)) ReorderIndices.clear(); + else if (hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), VL.size()) && + (!hasFullVectorsOrPowerOf2(*TTI, Op1.front()->getType(), + Op1.size()) || + !hasFullVectorsOrPowerOf2(*TTI, Op2.front()->getType(), + Op2.size()))) + return false; SmallVector Mask; if (!ReorderIndices.empty()) inversePermutation(ReorderIndices, Mask); @@ -10961,8 +11028,7 @@ BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality( if (S && isa(S.getMainOp()->getParent()->getTerminator())) { LLVM_DEBUG(dbgs() << "SLP: bundle in catchswitch block.\n"); // Do not try to pack to avoid extra instructions here. - return ScalarsVectorizationLegality(S, /*IsLegal=*/false, - /*TryToFindDuplicates=*/false); + return ScalarsVectorizationLegality(S, /*IsLegal=*/false); } // Check if this is a duplicate of another entry. @@ -11011,8 +11077,7 @@ BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality( if (!SLPReVec && getValueType(VL.front())->isVectorTy()) { LLVM_DEBUG(dbgs() << "SLP: Gathering due to vector type.\n"); // Do not try to pack to avoid extra instructions here. 
- return ScalarsVectorizationLegality(S, /*IsLegal=*/false, - /*TryToFindDuplicates=*/false); + return ScalarsVectorizationLegality(S, /*IsLegal=*/false); } // If all of the operands are identical or constant we have a simple solution. @@ -11106,7 +11171,6 @@ BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality( interleaveComma(VL, dbgs(), [&](Value *V) { dbgs() << *V; }); dbgs() << "]\n"); return ScalarsVectorizationLegality(S, /*IsLegal=*/false, - /*TryToFindDuplicates=*/true, /*TrySplitVectorize=*/true); } LLVM_DEBUG(dbgs() << "SLP: Gathering due to C,S,B,O, small shuffle. \n"; @@ -11123,8 +11187,7 @@ BoUpSLP::ScalarsVectorizationLegality BoUpSLP::getScalarsVectorizationLegality( LLVM_DEBUG(dbgs() << "SLP: The instruction (" << *V << ") is ephemeral.\n"); // Do not try to pack to avoid extra instructions here. - return ScalarsVectorizationLegality(S, /*IsLegal=*/false, - /*TryToFindDuplicates=*/false); + return ScalarsVectorizationLegality(S, /*IsLegal=*/false); } } } @@ -11292,9 +11355,8 @@ void BoUpSLP::buildTreeRec(ArrayRef VLRef, unsigned Depth, Legality = getScalarsVectorizationLegality( VL, Depth, UserTreeIdx, /*TryCopyableElementsVectorization=*/true); if (!Legality.isLegal()) { - if (Legality.tryToFindDuplicates()) - tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, - UserTreeIdx); + (void)tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, + UserTreeIdx, *this); newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices); return; @@ -11308,7 +11370,7 @@ void BoUpSLP::buildTreeRec(ArrayRef VLRef, unsigned Depth, // Check that every instruction appears once in this bundle. 
if (!tryToFindDuplicates(VL, ReuseShuffleIndices, *TTI, *TLI, S, UserTreeIdx, - /*TryPad=*/true)) { + *this, /*BuildGatherOnly=*/false)) { newGatherTreeEntry(VL, S, UserTreeIdx, ReuseShuffleIndices); return; } @@ -12820,8 +12882,11 @@ void BoUpSLP::transformNodes() { continue; SmallVector> Slices; bool AllStrided = true; - for (unsigned Cnt = StartIdx; Cnt + VF <= End; Cnt += VF) { - ArrayRef Slice = VL.slice(Cnt, VF); + for (unsigned Cnt = StartIdx; Cnt < End; Cnt += VF) { + const unsigned SliceVF = std::min(VF, End - Cnt); + if (SliceVF <= 1) + continue; + ArrayRef Slice = VL.slice(Cnt, SliceVF); // If any instruction is vectorized already - do not try again. // Reuse the existing node, if it fully matches the slice. if (isVectorized(Slice.front()) && @@ -12841,16 +12906,14 @@ void BoUpSLP::transformNodes() { } if (Slices.empty() || !IsSplat || !IsTwoRegisterSplat || count(Slice, Slice.front()) == - static_cast(isa(Slice.front()) ? VF - 1 + static_cast(isa(Slice.front()) ? SliceVF - 1 : 1)) { if (IsSplat) continue; InstructionsState S = getSameOpcode(Slice, *TLI); if (!S || !allSameOpcode(Slice) || !allSameBlock(Slice) || (S.getOpcode() == Instruction::Load && - areKnownNonVectorizableLoads(Slice)) || - (S.getOpcode() != Instruction::Load && - !hasFullVectorsOrPowerOf2(*TTI, Slice.front()->getType(), VF))) + areKnownNonVectorizableLoads(Slice))) continue; if (VF == 2) { // Try to vectorize reduced values or if all users are vectorized. 
@@ -16480,6 +16543,8 @@ BoUpSLP::tryToGatherExtractElements(SmallVectorImpl &VL, tryToGatherSingleRegisterExtractElements(SubVL, SubMask); ShufflesRes[Part] = Res; copy(SubMask, std::next(Mask.begin(), Part * SliceSize)); + if (SubVL.size() != SliceSize) + break; } if (none_of(ShufflesRes, [](const std::optional &Res) { return Res.has_value(); @@ -16491,7 +16556,8 @@ BoUpSLP::tryToGatherExtractElements(SmallVectorImpl &VL, std::optional BoUpSLP::isGatherShuffledSingleRegisterEntry( const TreeEntry *TE, ArrayRef VL, MutableArrayRef Mask, - SmallVectorImpl &Entries, unsigned Part, bool ForOrder) { + SmallVectorImpl &Entries, unsigned Part, bool ForOrder, + unsigned SliceSize) { Entries.clear(); // TODO: currently checking only for Scalars in the tree entry, need to count // reused elements too for better cost estimation. @@ -16788,8 +16854,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( (*It)->isSame(TE->Scalars)))) { Entries.push_back(*It); if ((*It)->getVectorFactor() == VL.size()) { - std::iota(std::next(Mask.begin(), Part * VL.size()), - std::next(Mask.begin(), (Part + 1) * VL.size()), 0); + std::iota(std::next(Mask.begin(), Part * SliceSize), + std::next(Mask.begin(), Part * SliceSize + VL.size()), 0); } else { SmallVector CommonMask = TE->getCommonMask(); copy(CommonMask, Mask.begin()); @@ -16797,7 +16863,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( // Clear undef scalars. 
for (unsigned I : seq(VL.size())) if (isa(VL[I])) - Mask[Part * VL.size() + I] = PoisonMaskElem; + Mask[Part * SliceSize + I] = PoisonMaskElem; return TargetTransformInfo::SK_PermuteSingleSrc; } // No perfect match, just shuffle, so choose the first tree node from the @@ -16949,8 +17015,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( Entries.swap(TempEntries); if (EntryLanes.size() == Entries.size() && !VL.equals(ArrayRef(TE->Scalars) - .slice(Part * VL.size(), - std::min(VL.size(), TE->Scalars.size())))) { + .slice(Part * SliceSize, getNumElems(TE->Scalars.size(), + SliceSize, Part)))) { // We may have here 1 or 2 entries only. If the number of scalars is equal // to the number of entries, no need to do the analysis, it is not very // profitable. Since VL is not the same as TE->Scalars, it means we already @@ -16963,7 +17029,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( // Pair.first is the offset to the vector, while Pair.second is the index of // scalar in the list. for (const std::pair &Pair : EntryLanes) { - unsigned Idx = Part * VL.size() + Pair.second; + unsigned Idx = Part * SliceSize + Pair.second; Mask[Idx] = Pair.first * VF + (ForOrder ? std::distance( @@ -16988,8 +17054,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( } else if (!isa(VL.front()->getType()) && (EntryLanes.size() > Entries.size() || VL.size() <= 2)) { // Do the cost estimation if shuffle beneficial than buildvector. 
- SmallVector SubMask(std::next(Mask.begin(), Part * VL.size()), - std::next(Mask.begin(), (Part + 1) * VL.size())); + SmallVector SubMask( + std::next(Mask.begin(), Part * SliceSize), + std::next(Mask.begin(), Part * SliceSize + VL.size())); int MinElement = SubMask.front(), MaxElement = SubMask.front(); for (int Idx : SubMask) { if (Idx == PoisonMaskElem) @@ -17002,16 +17069,23 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( assert(MaxElement >= 0 && MinElement >= 0 && MaxElement % VF >= MinElement % VF && "Expected at least single element."); - unsigned NewVF = std::max( - VL.size(), getFullVectorNumberOfElements(*TTI, VL.front()->getType(), - (MaxElement % VF) - - (MinElement % VF) + 1)); + unsigned Offset = 0; + unsigned MinIdx = MinElement % VF; + if (MinIdx > 1) { + unsigned MinVF = getFloorFullVectorNumberOfElements( + *TTI, VL.front()->getType(), MinIdx); + auto *VecTy = getWidenedType(VL.front()->getType(), MinVF); + unsigned NumParts = ::getNumberOfParts(*TTI, VecTy, MinVF); + if (NumParts > 1) + Offset = MinVF; + } + unsigned NewVF = + std::max(VL.size(), (MaxElement % VF) - Offset + 1); if (NewVF < VF) { for (int &Idx : SubMask) { if (Idx == PoisonMaskElem) continue; - Idx = ((Idx % VF) - (((MinElement % VF) / NewVF) * NewVF)) % NewVF + - (Idx >= static_cast(VF) ? NewVF : 0); + Idx = (Idx % VF) - Offset + (Idx >= static_cast(VF) ? 
NewVF : 0); } } else { NewVF = VF; @@ -17091,8 +17165,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( /*Extract=*/false, CostKind); const TreeEntry *BestEntry = nullptr; if (FirstShuffleCost < ShuffleCost) { - std::for_each(std::next(Mask.begin(), Part * VL.size()), - std::next(Mask.begin(), (Part + 1) * VL.size()), + std::for_each(std::next(Mask.begin(), Part * SliceSize), + std::next(Mask.begin(), Part * SliceSize + VL.size()), [&](int &Idx) { if (Idx >= static_cast(VF)) Idx = PoisonMaskElem; @@ -17101,8 +17175,8 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( ShuffleCost = FirstShuffleCost; } if (SecondShuffleCost < ShuffleCost) { - std::for_each(std::next(Mask.begin(), Part * VL.size()), - std::next(Mask.begin(), (Part + 1) * VL.size()), + std::for_each(std::next(Mask.begin(), Part * SliceSize), + std::next(Mask.begin(), Part * SliceSize + VL.size()), [&](int &Idx) { if (Idx < static_cast(VF)) Idx = PoisonMaskElem; @@ -17123,8 +17197,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( } Entries.clear(); // Clear the corresponding mask elements. - std::fill(std::next(Mask.begin(), Part * VL.size()), - std::next(Mask.begin(), (Part + 1) * VL.size()), PoisonMaskElem); + std::fill(std::next(Mask.begin(), Part * SliceSize), + std::next(Mask.begin(), Part * SliceSize + VL.size()), + PoisonMaskElem); return std::nullopt; } @@ -17144,15 +17219,11 @@ BoUpSLP::isGatherShuffledEntry( return !TE->isGather(); }))) return {}; - // FIXME: Gathering for non-power-of-2 (non whole registers) nodes not - // implemented yet. 
- if (TE->hasNonWholeRegisterOrNonPowerOf2Vec(*TTI)) - return {}; Mask.assign(VL.size(), PoisonMaskElem); assert((TE->UserTreeIndex || TE == VectorizableTree.front().get()) && "Expected only single user of the gather node."); - assert(VL.size() % NumParts == 0 && - "Number of scalars must be divisible by NumParts."); + unsigned PWSz = + getFullVectorNumberOfElements(*TTI, VL.front()->getType(), VL.size()); if (TE->UserTreeIndex && TE->UserTreeIndex.UserTE->isGather() && TE->UserTreeIndex.EdgeIdx == UINT_MAX && (TE->Idx == 0 || @@ -17161,15 +17232,17 @@ BoUpSLP::isGatherShuffledEntry( (TE->hasState() && getSameValuesTreeEntry(TE->getMainOp(), TE->Scalars)))) return {}; - unsigned SliceSize = getPartNumElems(VL.size(), NumParts); + unsigned SliceSize = getPartNumElems(PWSz, NumParts); SmallVector> Res; for (unsigned Part : seq(NumParts)) { + if (Part * SliceSize >= VL.size()) + break; ArrayRef SubVL = VL.slice(Part * SliceSize, getNumElems(VL.size(), SliceSize, Part)); SmallVectorImpl &SubEntries = Entries.emplace_back(); std::optional SubRes = isGatherShuffledSingleRegisterEntry(TE, SubVL, Mask, SubEntries, Part, - ForOrder); + ForOrder, SliceSize); if (!SubRes) SubEntries.clear(); Res.push_back(SubRes); @@ -20503,6 +20576,20 @@ Value *BoUpSLP::vectorizeTree( SI->setCondition(Constant::getNullValue(SI->getCondition()->getType())); } } + for (auto &TEPtr : VectorizableTree) { + TreeEntry *Entry = TEPtr.get(); + if (VectorizableTree.front()->VectorizedValue != Entry->VectorizedValue && + Entry->VectorizedValue && + !Entry->VectorizedValue->getType()->isVoidTy() && + (Entry->Idx != 0 || !Entry->hasState() || + (Entry->getOpcode() != Instruction::InsertElement && + Entry->getOpcode() != Instruction::PHI)) && + Entry->VectorizedValue->use_empty()) { + if (auto *I = dyn_cast(Entry->VectorizedValue)) + RemovedInsts.push_back(I); + } + } + // Retain to-be-deleted instructions for some debug-info bookkeeping and alias // cache correctness. 
// NOTE: removeInstructionAndOperands only marks the instruction for deletion diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll index 9c615bb4757fa..a1d3f250b8a83 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/InstructionsState-is-invalid-0.ll @@ -7,18 +7,20 @@ target triple = "aarch64-unknown-linux-gnu" define void @foo(ptr %0) { ; CHECK-LABEL: @foo( ; CHECK-NEXT: vector.scevcheck: -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x ptr> , ptr [[TMP0:%.*]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i64> splat (i64 4) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x ptr> [[TMP2]], <2 x ptr> poison, <4 x i32> +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[TMP0:%.*]], i64 4 +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr null, i64 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x ptr> [[TMP1]], ptr [[SCEVGEP]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x ptr> [[TMP2]], <4 x ptr> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <4 x ptr> [[TMP3]], zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = and <4 x i1> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x ptr> [[TMP6]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> poison, <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x ptr> [[TMP5]], <4 x ptr> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> [[TMP5]], <8 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x ptr> [[TMP12]], <8 x ptr> poison, <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] 
= icmp ult <8 x ptr> [[TMP8]], zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = and <8 x i1> [[TMP9]], zeroinitializer -; CHECK-NEXT: [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v8i1(<8 x i1> [[TMP10]]) +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[SCEVGEP3]], i32 1 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = icmp ult <4 x ptr> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP10:%.*]] = and <4 x i1> [[TMP9]], zeroinitializer +; CHECK-NEXT: [[RDX_OP:%.*]] = or <4 x i1> [[TMP5]], [[TMP10]] +; CHECK-NEXT: [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[RDX_OP]]) ; CHECK-NEXT: br i1 [[OP_RDX]], label [[DOTLR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll index 6bea2554c6f96..28d3825c181d5 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/PR38339.ll @@ -3,9 +3,9 @@ define void @f1(<2 x i16> %x, ptr %a) { ; CHECK-LABEL: @f1( -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[X:%.*]], <2 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i16> [[X]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = extractelement <2 x i16> [[X:%.*]], i32 0 ; CHECK-NEXT: store i16 [[TMP1]], ptr [[A:%.*]], align 2 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[X]], <2 x i16> poison, <4 x i32> ; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], ptr undef, align 2 ; CHECK-NEXT: ret void ; @@ -29,9 +29,9 @@ define void @f2(<2 x i16> %x, ptr %a) { ; CHECK: cont: ; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ] ; CHECK-NEXT: [[AA:%.*]] = phi ptr [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> poison, <4 x i32> ; CHECK-NEXT: 
[[TMP0:%.*]] = extractelement <2 x i16> [[XX]], i32 0 ; CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> poison, <4 x i32> ; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], ptr undef, align 2 ; CHECK-NEXT: [[A_VAL:%.*]] = load i16, ptr [[A]], align 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0 @@ -70,9 +70,9 @@ define void @f3(<2 x i16> %x, ptr %a) { ; CHECK: cont: ; CHECK-NEXT: [[XX:%.*]] = phi <2 x i16> [ [[X:%.*]], [[ENTRY:%.*]] ], [ undef, [[CONT]] ] ; CHECK-NEXT: [[AA:%.*]] = phi ptr [ [[A:%.*]], [[ENTRY]] ], [ undef, [[CONT]] ] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i16> [[XX]], i32 1 ; CHECK-NEXT: store i16 [[TMP0]], ptr [[A]], align 2 +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x i16> [[XX]], <2 x i16> poison, <4 x i32> ; CHECK-NEXT: store <4 x i16> [[SHUFFLE]], ptr undef, align 2 ; CHECK-NEXT: [[A_VAL:%.*]] = load i16, ptr [[A]], align 2 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i16 [[A_VAL]], 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll index 8d44d03e0e5cc..b83dda35abfdb 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/alternate-vectorization-split-node.ll @@ -10,9 +10,11 @@ define i32 @test(ptr %c) { ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[BITLEN]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <6 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = lshr <6 x i64> [[TMP1]], zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i64> [[TMP0]], <2 x i64> poison, <2 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <6 
x i64> [[TMP2]], <6 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> [[TMP7]], <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP8]], <8 x i64> poison, <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i8> ; CHECK-NEXT: store <8 x i8> [[TMP6]], ptr [[INCDEC_PTR_3_1]], align 1 ; CHECK-NEXT: ret i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll index 0f47c6b3ac902..d0e0b8b3657b3 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose-inseltpoison.ll @@ -124,16 +124,16 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) { define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_1( -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> 
[[V0:%.*]], <2 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <3 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <3 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <3 x i32> [[TMP3]], <3 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <3 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <3 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <3 x i32> [[TMP8]], <3 x i32> [[TMP11]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP5]], [[TMP10]] ; CHECK-NEXT: ret <4 x i32> [[TMP9]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll index 370ed1f258aca..68a7ac119c056 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/transpose.ll @@ -124,16 +124,16 @@ define <4 x i32> @build_vec_v4i32_reuse_0(<2 x i32> %v0, <2 x i32> %v1) { define <4 x i32> @build_vec_v4i32_reuse_1(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: @build_vec_v4i32_reuse_1( -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP6]], [[TMP7]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP4]], <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = add <4 x i32> 
[[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP11:%.*]] = xor <4 x i32> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[V0:%.*]], <2 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[V1:%.*]], <2 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <3 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = xor <3 x i32> [[TMP1]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <3 x i32> [[TMP3]], <3 x i32> [[TMP4]], <4 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[V0]], <2 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i32> [[V1]], <2 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add <3 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = xor <3 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <3 x i32> [[TMP8]], <3 x i32> [[TMP11]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = sub <4 x i32> [[TMP5]], [[TMP10]] ; CHECK-NEXT: ret <4 x i32> [[TMP9]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll index 92027d0043f76..3be08790bc2e4 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -17,12 +17,15 @@ define void @s116_modified(ptr %a) { ; CHECK-LABEL: @s116_modified( -; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 4 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds float, ptr [[GEP1:%.*]], i64 1 +; CHECK-NEXT: [[A:%.*]] = getelementptr inbounds float, ptr [[GEP1]], i64 4 +; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[LD0:%.*]] = load float, ptr [[A]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[GEP1]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> 
+; CHECK-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[GEP2]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP3]], float [[LD1]], i32 0 ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP2]], float [[LD0]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]] ; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[GEP1]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll index cbf8bc9dcf8f8..db411f2050ead 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll @@ -245,26 +245,17 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-NEXT: [[L_8:%.*]] = load i8, ptr [[GEP_8]], align 1 ; CHECK-NEXT: [[CMP_8:%.*]] = icmp ugt i8 [[L_8]], -1 ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 9 -; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 11 -; CHECK-NEXT: [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1 -; CHECK-NEXT: [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12 ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = load i8, ptr [[PTR]], align 1 ; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[GEP_9]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> -; 
CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP4]], <16 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[L_11]], i32 11 -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP6]], <16 x i8> [[TMP7]], <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP3]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> [[TMP10]], <16 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <16 x i8> [[TMP8]], splat (i8 -1) -; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i8> [[TMP12]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP14:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP8]], <16 x i8> [[TMP13]] +; CHECK-NEXT: [[TMP3:%.*]] = load <7 x i8>, ptr [[GEP_9]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <7 x i8> [[TMP3]], <7 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP2]], <15 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = icmp ugt <15 x i8> [[TMP7]], splat (i8 -1) +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <15 x i8> poison, i8 [[X]], i32 0 +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <15 x i8> [[TMP9]], <15 x i8> poison, <15 x i32> zeroinitializer +; CHECK-NEXT: [[TMP11:%.*]] = select <15 x i1> [[TMP8]], <15 x i8> [[TMP7]], <15 x i8> [[TMP10]] +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <15 x i8> [[TMP11]], <15 x i8> poison, <16 x i32> ; CHECK-NEXT: store <16 x i8> [[TMP14]], ptr [[PTR]], align 2 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll index 645dbc49269f0..05609d7badf49 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/basic-strided-loads.ll @@ -252,7 +252,6 @@ define void @const_stride_2_with_reordering(ptr %pl, ptr %ps) { ; CHECK-NEXT: [[GEP_L0:%.*]] = getelementptr inbounds i8, ptr [[PL]], i64 0 ; CHECK-NEXT: [[GEP_S0:%.*]] = getelementptr inbounds i8, ptr [[PS]], i64 0 ; CHECK-NEXT: [[TMP1:%.*]] = call <31 x i8> @llvm.masked.load.v31i8.p0(ptr [[GEP_L0]], i32 16, <31 x i1> , <31 x i8> poison) -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <31 x i8> [[TMP1]], <31 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <31 x i8> [[TMP1]], <31 x i8> poison, <16 x i32> ; CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[GEP_S0]], align 16 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll index 82c940353ba5a..72a349fe0c585 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/gather-insert-point-restore.ll @@ -5,10 +5,9 @@ define i16 @test(ptr %i) { ; CHECK-LABEL: define i16 @test( ; CHECK-SAME: ptr [[I:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[I]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, <2 x ptr> [[TMP1]], <2 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x ptr> [[TMP2]], <2 x ptr> poison, <4 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[I]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <4 x ptr> [[TMP1]], <4 x i64> ; CHECK-NEXT: [[GEP_US154_2:%.*]] = getelementptr i8, ptr [[I]], i64 142688 ; CHECK-NEXT: br 
label %[[FOR_COND5_US:.*]] ; CHECK: [[FOR_COND5_US]]: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll index d4e323819402c..db1370efb8133 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reordered-buildvector-scalars.ll @@ -14,85 +14,38 @@ define fastcc i32 @test(i32 %0, i32 %add111.i.i, <4 x i32> %PredPel.i.sroa.86.72 ; CHECK-NEXT: [[LOOPARRAY_SROA_24_0_I_I3:%.*]] = ashr i32 [[TMP0]], 1 ; CHECK-NEXT: [[SHR143_5_I_I9:%.*]] = ashr i32 [[TMP0]], 1 ; CHECK-NEXT: [[ADD1392_I:%.*]] = add i32 [[TMP0]], 1 -; CHECK-NEXT: [[PREDPEL_I_SROA_86_80_VEC_EXTRACT59312:%.*]] = extractelement <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], i64 0 ; CHECK-NEXT: [[MUL1445_I:%.*]] = shl i32 [[TMP0]], 1 -; CHECK-NEXT: [[ADD2136_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], [[TMP0]] -; CHECK-NEXT: [[SHR2137_I:%.*]] = lshr i32 [[ADD2136_I]], 1 -; CHECK-NEXT: [[CONV2138_I:%.*]] = trunc i32 [[SHR2137_I]] to i16 -; CHECK-NEXT: [[ADD2157_I:%.*]] = add i32 [[PREDPEL_I_SROA_86_80_VEC_EXTRACT59312]], 1 -; CHECK-NEXT: [[SHR2158_I:%.*]] = lshr i32 [[ADD2157_I]], 1 -; CHECK-NEXT: [[CONV2159_I:%.*]] = trunc i32 [[SHR2158_I]] to i16 -; CHECK-NEXT: [[ADD2174_I:%.*]] = add i32 [[MUL1445_I]], 2 -; CHECK-NEXT: [[SHR2175_I:%.*]] = lshr i32 [[ADD2174_I]], 2 -; CHECK-NEXT: [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16 -; CHECK-NEXT: [[ADD2190_I:%.*]] = or i32 [[ADD1392_I]], 1 -; CHECK-NEXT: [[ADD2191_I:%.*]] = add i32 [[ADD2190_I]], [[TMP0]] -; CHECK-NEXT: [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16 -; CHECK-NEXT: [[ADD2203_I:%.*]] = or i32 [[TMP0]], 1 -; CHECK-NEXT: [[ADD2204_I:%.*]] = add i32 [[ADD2203_I]], [[TMP0]] -; CHECK-NEXT: [[CONV2206_I:%.*]] = trunc i32 [[ADD2204_I]] to i16 ; CHECK-NEXT: [[ADD2235_I16:%.*]] = or i32 [[TMP0]], 1 -; CHECK-NEXT: [[ADD2236_I:%.*]] = add i32 [[ADD2235_I16]], 1 
-; CHECK-NEXT: [[SHR2237_I:%.*]] = lshr i32 [[ADD2236_I]], 1 -; CHECK-NEXT: [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16 -; CHECK-NEXT: store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4 -; CHECK-NEXT: store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8176), align 8 -; CHECK-NEXT: [[ADD2258_I:%.*]] = or i32 [[ADD111_I_I]], [[TMP0]] -; CHECK-NEXT: [[SHR2259_I:%.*]] = lshr i32 [[ADD2258_I]], 1 -; CHECK-NEXT: [[CONV2260_I:%.*]] = trunc i32 [[SHR2259_I]] to i16 -; CHECK-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4 -; CHECK-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8192), align 8 -; CHECK-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8172), align 4 -; CHECK-NEXT: [[ADD2280_I:%.*]] = add i32 [[ADD111_I_I]], 1 -; CHECK-NEXT: [[SHR2281_I:%.*]] = lshr i32 [[ADD2280_I]], 1 -; CHECK-NEXT: [[CONV2282_I:%.*]] = trunc i32 [[SHR2281_I]] to i16 -; CHECK-NEXT: store i16 [[CONV2282_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4 -; CHECK-NEXT: store i16 [[CONV2282_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8208), align 8 -; CHECK-NEXT: store i16 [[CONV2282_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8188), align 4 -; CHECK-NEXT: [[ADD2302_I:%.*]] = add i32 [[TMP0]], 1 -; CHECK-NEXT: [[SHR2303_I:%.*]] = lshr i32 [[ADD2302_I]], 1 -; CHECK-NEXT: [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16 -; CHECK-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8 -; CHECK-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4 -; CHECK-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8184), align 8 ; CHECK-NEXT: [[ADD2323_I:%.*]] = add i32 [[TMP0]], 1 -; CHECK-NEXT: [[ADD2324_I:%.*]] = or i32 
[[ADD2323_I]], [[TMP0]] -; CHECK-NEXT: [[SHR2325_I:%.*]] = lshr i32 [[ADD2324_I]], 1 -; CHECK-NEXT: [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16 -; CHECK-NEXT: store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4 -; CHECK-NEXT: store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8200), align 8 -; CHECK-NEXT: [[ADD2342_I:%.*]] = add i32 [[SHR143_5_I_I9]], 1 -; CHECK-NEXT: [[SHR2343_I:%.*]] = lshr i32 [[ADD2342_I]], 1 -; CHECK-NEXT: [[CONV2344_I:%.*]] = trunc i32 [[SHR2343_I]] to i16 -; CHECK-NEXT: store i16 [[CONV2344_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8216), align 8 -; CHECK-NEXT: [[ADD2355_I:%.*]] = or i32 [[SHR143_5_I_I9]], 1 -; CHECK-NEXT: [[ADD2356_I:%.*]] = add i32 [[ADD2355_I]], [[TMP0]] -; CHECK-NEXT: [[CONV2358_I:%.*]] = trunc i32 [[ADD2356_I]] to i16 -; CHECK-NEXT: store i16 [[CONV2358_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8232), align 8 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1) -; CHECK-NEXT: [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat (i32 1) -; CHECK-NEXT: [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16> -; CHECK-NEXT: store <2 x i16> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4 -; CHECK-NEXT: [[ADD2393_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], 1 -; CHECK-NEXT: [[ADD2394_I:%.*]] = add i32 [[ADD2393_I]], [[TMP0]] -; CHECK-NEXT: [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16 -; CHECK-NEXT: store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2 -; CHECK-NEXT: store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2 -; CHECK-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr 
@images, i64 8214), align 2 -; CHECK-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8194), align 2 -; CHECK-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8174), align 2 -; CHECK-NEXT: store i16 [[CONV2159_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8230), align 2 -; CHECK-NEXT: store i16 [[CONV2159_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8210), align 2 -; CHECK-NEXT: store i16 [[CONV2159_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8190), align 2 -; CHECK-NEXT: store i16 [[CONV2159_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2 -; CHECK-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8226), align 2 -; CHECK-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8206), align 2 -; CHECK-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8186), align 2 -; CHECK-NEXT: store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8222), align 2 -; CHECK-NEXT: store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8202), align 2 -; CHECK-NEXT: store i16 [[CONV2206_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8218), align 2 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <3 x i32> poison, i32 [[ADD111_I_I]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[ADD2323_I]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <3 x i32> poison, i32 [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <3 x i32> [[TMP5]], <3 x i32> poison, <3 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = or <3 x i32> [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], <4 x i32> [[TMP1]], <8 x i32> +; 
CHECK-NEXT: [[TMP9:%.*]] = shufflevector <3 x i32> [[TMP4]], <3 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP0]], i32 4 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[ADD2235_I16]], i32 1 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[MUL1445_I]], i32 5 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[SHR143_5_I_I9]], i32 7 +; CHECK-NEXT: [[TMP15:%.*]] = add <8 x i32> [[TMP14]], +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <8 x i32> [[TMP15]], <8 x i32> poison, <11 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <3 x i32> [[TMP7]], <3 x i32> poison, <11 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <11 x i32> [[TMP16]], <11 x i32> [[TMP17]], <11 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = lshr <11 x i32> [[TMP18]], +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[ADD1392_I]], i32 1 +; CHECK-NEXT: [[TMP22:%.*]] = or <4 x i32> [[TMP21]], splat (i32 1) +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = add <4 x i32> [[TMP22]], [[TMP23]] +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <11 x i32> [[TMP19]], <11 x i32> poison, <15 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP24]], <4 x i32> poison, <15 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <15 x i32> [[TMP25]], <15 x i32> [[TMP26]], <15 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = trunc <15 x i32> [[TMP27]] to <15 x i16> +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <15 x i16> [[TMP28]], <15 x i16> poison, <32 x i32> +; CHECK-NEXT: store <32 x i16> [[TMP29]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2 ; CHECK-NEXT: ret i32 0 ; ; THRESH-LABEL: define fastcc i32 
@test( @@ -102,80 +55,37 @@ define fastcc i32 @test(i32 %0, i32 %add111.i.i, <4 x i32> %PredPel.i.sroa.86.72 ; THRESH-NEXT: [[SHR143_5_I_I9:%.*]] = ashr i32 [[TMP0]], 1 ; THRESH-NEXT: [[ADD1392_I:%.*]] = add i32 [[TMP0]], 1 ; THRESH-NEXT: [[MUL1445_I:%.*]] = shl i32 [[TMP0]], 1 -; THRESH-NEXT: [[ADD2136_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], [[TMP0]] -; THRESH-NEXT: [[SHR2137_I:%.*]] = lshr i32 [[ADD2136_I]], 1 -; THRESH-NEXT: [[CONV2138_I:%.*]] = trunc i32 [[SHR2137_I]] to i16 -; THRESH-NEXT: [[ADD2174_I:%.*]] = add i32 [[MUL1445_I]], 2 -; THRESH-NEXT: [[SHR2175_I:%.*]] = lshr i32 [[ADD2174_I]], 2 -; THRESH-NEXT: [[CONV2176_I:%.*]] = trunc i32 [[SHR2175_I]] to i16 -; THRESH-NEXT: [[ADD2190_I:%.*]] = or i32 [[ADD1392_I]], 1 -; THRESH-NEXT: [[ADD2191_I:%.*]] = add i32 [[ADD2190_I]], [[TMP0]] -; THRESH-NEXT: [[CONV2193_I:%.*]] = trunc i32 [[ADD2191_I]] to i16 -; THRESH-NEXT: [[ADD2203_I:%.*]] = or i32 [[TMP0]], 1 -; THRESH-NEXT: [[ADD2204_I:%.*]] = add i32 [[ADD2203_I]], [[TMP0]] -; THRESH-NEXT: [[CONV2206_I:%.*]] = trunc i32 [[ADD2204_I]] to i16 ; THRESH-NEXT: [[ADD2235_I16:%.*]] = or i32 [[TMP0]], 1 -; THRESH-NEXT: [[ADD2236_I:%.*]] = add i32 [[ADD2235_I16]], 1 -; THRESH-NEXT: [[SHR2237_I:%.*]] = lshr i32 [[ADD2236_I]], 1 -; THRESH-NEXT: [[CONV2238_I:%.*]] = trunc i32 [[SHR2237_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8196), align 4 -; THRESH-NEXT: store i16 [[CONV2238_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8176), align 8 -; THRESH-NEXT: [[ADD2258_I:%.*]] = or i32 [[ADD111_I_I]], [[TMP0]] -; THRESH-NEXT: [[SHR2259_I:%.*]] = lshr i32 [[ADD2258_I]], 1 -; THRESH-NEXT: [[CONV2260_I:%.*]] = trunc i32 [[SHR2259_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8212), align 4 -; THRESH-NEXT: store i16 [[CONV2260_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8192), align 8 -; THRESH-NEXT: store i16 [[CONV2260_I]], 
ptr getelementptr inbounds nuw (i8, ptr @images, i64 8172), align 4 -; THRESH-NEXT: [[ADD2302_I:%.*]] = add i32 [[TMP0]], 1 -; THRESH-NEXT: [[SHR2303_I:%.*]] = lshr i32 [[ADD2302_I]], 1 -; THRESH-NEXT: [[CONV2304_I:%.*]] = trunc i32 [[SHR2303_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8224), align 8 -; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8204), align 4 -; THRESH-NEXT: store i16 [[CONV2304_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8184), align 8 ; THRESH-NEXT: [[ADD2323_I:%.*]] = add i32 [[TMP0]], 1 -; THRESH-NEXT: [[ADD2324_I:%.*]] = or i32 [[ADD2323_I]], [[TMP0]] -; THRESH-NEXT: [[SHR2325_I:%.*]] = lshr i32 [[ADD2324_I]], 1 -; THRESH-NEXT: [[CONV2326_I:%.*]] = trunc i32 [[SHR2325_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8220), align 4 -; THRESH-NEXT: store i16 [[CONV2326_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8200), align 8 -; THRESH-NEXT: [[ADD2342_I:%.*]] = add i32 [[SHR143_5_I_I9]], 1 -; THRESH-NEXT: [[SHR2343_I:%.*]] = lshr i32 [[ADD2342_I]], 1 -; THRESH-NEXT: [[CONV2344_I:%.*]] = trunc i32 [[SHR2343_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2344_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8216), align 8 -; THRESH-NEXT: [[ADD2355_I:%.*]] = or i32 [[SHR143_5_I_I9]], 1 -; THRESH-NEXT: [[ADD2356_I:%.*]] = add i32 [[ADD2355_I]], [[TMP0]] -; THRESH-NEXT: [[CONV2358_I:%.*]] = trunc i32 [[ADD2356_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2358_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8232), align 8 -; THRESH-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <2 x i32> -; THRESH-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 0 -; THRESH-NEXT: [[TMP4:%.*]] = add <2 x i32> [[TMP3]], splat (i32 1) -; THRESH-NEXT: [[TMP5:%.*]] = lshr <2 x i32> [[TMP4]], splat 
(i32 1) -; THRESH-NEXT: [[TMP6:%.*]] = trunc <2 x i32> [[TMP5]] to <2 x i16> -; THRESH-NEXT: store <2 x i16> [[TMP6]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8180), align 4 -; THRESH-NEXT: [[ADD2393_I:%.*]] = or i32 [[LOOPARRAY_SROA_24_0_I_I3]], 1 -; THRESH-NEXT: [[ADD2394_I:%.*]] = add i32 [[ADD2393_I]], [[TMP0]] -; THRESH-NEXT: [[CONV2396_I:%.*]] = trunc i32 [[ADD2394_I]] to i16 -; THRESH-NEXT: store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8198), align 2 -; THRESH-NEXT: store i16 [[CONV2396_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8178), align 2 -; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8214), align 2 -; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8194), align 2 -; THRESH-NEXT: store i16 [[CONV2138_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8174), align 2 -; THRESH-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], <4 x i32> poison, <2 x i32> -; THRESH-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[ADD111_I_I]], i32 0 -; THRESH-NEXT: [[TMP9:%.*]] = add <2 x i32> [[TMP8]], splat (i32 1) -; THRESH-NEXT: [[TMP10:%.*]] = lshr <2 x i32> [[TMP9]], splat (i32 1) -; THRESH-NEXT: [[TMP11:%.*]] = trunc <2 x i32> [[TMP10]] to <2 x i16> -; THRESH-NEXT: [[TMP12:%.*]] = extractelement <2 x i16> [[TMP11]], i32 1 -; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8228), align 4 -; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8208), align 8 -; THRESH-NEXT: store <2 x i16> [[TMP11]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8188), align 4 -; THRESH-NEXT: store i16 [[TMP12]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2 -; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8226), align 2 -; 
THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8206), align 2 -; THRESH-NEXT: store i16 [[CONV2176_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8186), align 2 -; THRESH-NEXT: store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8222), align 2 -; THRESH-NEXT: store i16 [[CONV2193_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8202), align 2 -; THRESH-NEXT: store i16 [[CONV2206_I]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8218), align 2 +; THRESH-NEXT: [[TMP2:%.*]] = insertelement <3 x i32> poison, i32 [[ADD111_I_I]], i32 0 +; THRESH-NEXT: [[TMP3:%.*]] = insertelement <3 x i32> [[TMP2]], i32 [[LOOPARRAY_SROA_24_0_I_I3]], i32 1 +; THRESH-NEXT: [[TMP4:%.*]] = insertelement <3 x i32> [[TMP3]], i32 [[ADD2323_I]], i32 2 +; THRESH-NEXT: [[TMP5:%.*]] = insertelement <3 x i32> poison, i32 [[TMP0]], i32 0 +; THRESH-NEXT: [[TMP6:%.*]] = shufflevector <3 x i32> [[TMP5]], <3 x i32> poison, <3 x i32> zeroinitializer +; THRESH-NEXT: [[TMP7:%.*]] = or <3 x i32> [[TMP4]], [[TMP6]] +; THRESH-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[PREDPEL_I_SROA_86_72_VEC_EXTRACT]], <4 x i32> [[TMP1]], <8 x i32> +; THRESH-NEXT: [[TMP9:%.*]] = shufflevector <3 x i32> [[TMP4]], <3 x i32> poison, <8 x i32> +; THRESH-NEXT: [[TMP10:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> [[TMP9]], <8 x i32> +; THRESH-NEXT: [[TMP11:%.*]] = insertelement <8 x i32> [[TMP10]], i32 [[TMP0]], i32 4 +; THRESH-NEXT: [[TMP12:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[ADD2235_I16]], i32 1 +; THRESH-NEXT: [[TMP13:%.*]] = insertelement <8 x i32> [[TMP12]], i32 [[MUL1445_I]], i32 5 +; THRESH-NEXT: [[TMP14:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[SHR143_5_I_I9]], i32 7 +; THRESH-NEXT: [[TMP15:%.*]] = add <8 x i32> [[TMP14]], +; THRESH-NEXT: [[TMP16:%.*]] = shufflevector <8 x i32> [[TMP15]], <8 x i32> poison, <11 x i32> +; THRESH-NEXT: [[TMP17:%.*]] = shufflevector <3 x i32> [[TMP7]], <3 x i32> 
poison, <11 x i32> +; THRESH-NEXT: [[TMP18:%.*]] = shufflevector <11 x i32> [[TMP16]], <11 x i32> [[TMP17]], <11 x i32> +; THRESH-NEXT: [[TMP19:%.*]] = lshr <11 x i32> [[TMP18]], +; THRESH-NEXT: [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> poison, <4 x i32> +; THRESH-NEXT: [[TMP21:%.*]] = insertelement <4 x i32> [[TMP20]], i32 [[ADD1392_I]], i32 1 +; THRESH-NEXT: [[TMP22:%.*]] = or <4 x i32> [[TMP21]], splat (i32 1) +; THRESH-NEXT: [[TMP23:%.*]] = shufflevector <8 x i32> [[TMP14]], <8 x i32> poison, <4 x i32> +; THRESH-NEXT: [[TMP24:%.*]] = add <4 x i32> [[TMP22]], [[TMP23]] +; THRESH-NEXT: [[TMP25:%.*]] = shufflevector <11 x i32> [[TMP19]], <11 x i32> poison, <15 x i32> +; THRESH-NEXT: [[TMP26:%.*]] = shufflevector <4 x i32> [[TMP24]], <4 x i32> poison, <15 x i32> +; THRESH-NEXT: [[TMP27:%.*]] = shufflevector <15 x i32> [[TMP25]], <15 x i32> [[TMP26]], <15 x i32> +; THRESH-NEXT: [[TMP28:%.*]] = trunc <15 x i32> [[TMP27]] to <15 x i16> +; THRESH-NEXT: [[TMP29:%.*]] = shufflevector <15 x i16> [[TMP28]], <15 x i16> poison, <32 x i32> +; THRESH-NEXT: store <32 x i16> [[TMP29]], ptr getelementptr inbounds nuw (i8, ptr @images, i64 8170), align 2 ; THRESH-NEXT: ret i32 0 ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/smin-signed-zextended.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/smin-signed-zextended.ll index 8db3a8b6ff219..8b409851d1eb1 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/smin-signed-zextended.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/smin-signed-zextended.ll @@ -5,9 +5,9 @@ define <4 x i32> @test(i16 %0, i16 %1) { ; CHECK-LABEL: define <4 x i32> @test( ; CHECK-SAME: i16 [[TMP0:%.*]], i16 [[TMP1:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i16> , i16 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = zext <2 x i16> [[TMP2]] to <2 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = 
insertelement <4 x i16> , i16 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> ; CHECK-NEXT: [[CONV15_I:%.*]] = sext i16 [[TMP0]] to i32 ; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], splat (i32 -1) ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> , i32 [[CONV15_I]], i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll index 510cf45edbb52..01a279890ea24 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/unordered-loads-operands.ll @@ -8,32 +8,30 @@ define void @test(ptr %mdct_forward_x) { ; CHECK-NEXT: br label %[[FOR_COND:.*]] ; CHECK: [[FOR_COND]]: ; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[MDCT_FORWARD_X]], align 8 -; CHECK-NEXT: [[ARRAYIDX2_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 32 -; CHECK-NEXT: [[ARRAYIDX5_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40 ; CHECK-NEXT: [[ADD_PTR_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 24 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x ptr> [[TMP1]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <4 x ptr> [[TMP2]], <4 x i64> -; CHECK-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.masked.load.v3f32.p0(ptr [[ADD_PTR_I]], i32 4, <3 x i1> , <3 x float> poison) -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> poison, <2 x i32> +; CHECK-NEXT: [[ARRAYIDX5_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 40 +; CHECK-NEXT: [[ARRAYIDX10_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 28 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <3 x ptr> poison, ptr [[TMP0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x ptr> [[TMP1]], <3 x ptr> poison, <3 x 
i32> zeroinitializer +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, <3 x ptr> [[TMP2]], <3 x i64> +; CHECK-NEXT: [[ARRAYIDX2_I_I:%.*]] = getelementptr i8, ptr [[TMP0]], i64 32 ; CHECK-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.masked.load.v3f32.p0(ptr [[ARRAYIDX5_I_I]], i32 4, <3 x i1> , <3 x float> poison) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p0(<4 x ptr> [[TMP3]], i32 4, <4 x i1> splat (i1 true), <4 x float> poison) -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <3 x float> [[TMP5]], <3 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x float> , <4 x float> [[TMP22]], <4 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> [[TMP10]], <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = fsub <4 x float> [[TMP9]], [[TMP12]] -; CHECK-NEXT: [[TMP14:%.*]] = fadd <4 x float> [[TMP9]], [[TMP12]] -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = fsub <4 x float> zeroinitializer, [[TMP8]] -; CHECK-NEXT: [[TMP17:%.*]] = fadd <4 x float> zeroinitializer, [[TMP8]] -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <4 x float> [[TMP16]], <4 x float> [[TMP17]], <4 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = call <2 x float> @llvm.experimental.vp.strided.load.v2f32.p0.i64(ptr align 4 [[ARRAYIDX10_I_I]], i64 -4, <2 x i1> splat (i1 true), i32 2) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP20]], <2 x float> poison, <3 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <3 x float> , <3 x float> [[TMP6]], <3 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = fsub <3 x float> [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = fadd <3 x float> 
[[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <3 x float> [[TMP9]], <3 x float> [[TMP10]], <3 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = call <3 x float> @llvm.masked.gather.v3f32.v3p0(<3 x ptr> [[TMP3]], i32 4, <3 x i1> splat (i1 true), <3 x float> poison) +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <3 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <3 x float> , <3 x float> [[TMP6]], <3 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = fsub <3 x float> [[TMP14]], [[TMP12]] +; CHECK-NEXT: [[TMP16:%.*]] = fadd <3 x float> [[TMP14]], [[TMP12]] +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <3 x float> [[TMP15]], <3 x float> [[TMP16]], <3 x i32> ; CHECK-NEXT: store float 0.000000e+00, ptr [[ADD_PTR_I]], align 4 -; CHECK-NEXT: [[TMP19:%.*]] = fsub <4 x float> [[TMP15]], [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = fadd <4 x float> [[TMP15]], [[TMP18]] -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP19]], <4 x float> [[TMP20]], <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = fsub <3 x float> [[TMP17]], [[TMP11]] +; CHECK-NEXT: [[TMP19:%.*]] = fadd <3 x float> [[TMP17]], [[TMP11]] +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <3 x float> [[TMP18]], <3 x float> [[TMP19]], <4 x i32> ; CHECK-NEXT: store <4 x float> [[TMP21]], ptr [[ARRAYIDX2_I_I]], align 4 ; CHECK-NEXT: br label %[[FOR_COND]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reuse-non-power-of-2-reorder.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reuse-non-power-of-2-reorder.ll index dbeff25954085..64de6033a4875 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/reuse-non-power-of-2-reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reuse-non-power-of-2-reorder.ll @@ -4,23 +4,23 @@ define void @test(i32 %0, i64 %1, i32 %2, i32 %3, ptr %4) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: i32 [[TMP0:%.*]], i64 [[TMP1:%.*]], i32 [[TMP2:%.*]], i32 [[TMP3:%.*]], ptr [[TMP4:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: [[TMP56:%.*]] = trunc 
i64 [[TMP1]] to i32 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP56]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP73:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> poison, i32 [[TMP73]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP98:%.*]] = trunc i64 [[TMP1]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP1]] to i32 ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <60 x i32> , i32 [[TMP0]], i32 7 ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <60 x i32> [[TMP11]], <60 x i32> poison, <60 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <60 x i32> poison, i32 [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP14:%.*]] = insertelement <60 x i32> [[TMP13]], i32 [[TMP2]], i32 7 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <60 x i32> [[TMP14]], i32 [[TMP98]], i32 0 -; CHECK-NEXT: [[TMP16:%.*]] = insertelement <60 x i32> [[TMP15]], i32 [[TMP73]], i32 6 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <60 x i32> [[TMP14]], i32 [[TMP10]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <60 x i32> [[TMP15]], i32 [[TMP98]], i32 6 ; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <60 x i32> [[TMP16]], <60 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <60 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <60 x i32> [[TMP16]], <60 x i32> [[TMP22]], <8 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <8 x i32> [[TMP23]], <8 x i32> [[TMP18]], <8 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP19]], <8 x i32> poison, <60 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x i32> [[TMP8]], <2 x i32> poison, <60 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <60 x i32> [[TMP16]], 
<60 x i32> [[TMP19]], <8 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <8 x i32> [[TMP22]], <8 x i32> [[TMP18]], <8 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x i32> [[TMP23]], <8 x i32> poison, <60 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = xor <60 x i32> [[TMP12]], [[TMP20]] ; CHECK-NEXT: [[TMP130:%.*]] = call i32 @llvm.vector.reduce.or.v60i32(<60 x i32> [[TMP21]]) ; CHECK-NEXT: store i32 [[TMP130]], ptr [[TMP4]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll index 143e09374a891..c083e9e551441 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll @@ -8,15 +8,15 @@ define void @Test(i32) { ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = phi <8 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ] ; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]] ; CHECK-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US]] ; CHECK-NEXT: [[VAL_43]] = add i32 [[LOCAL_8_43_US]], 14910 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0 -; CHECK-NEXT: [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_43]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> poison, i32 [[OP_RDX1]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[VAL_43]], i32 1 +; CHECK-NEXT: [[TMP6]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: br label [[LOOP]] ; ; 
FORCE_REDUCTION-LABEL: @Test( @@ -24,15 +24,15 @@ define void @Test(i32) { ; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]] ; FORCE_REDUCTION: loop: ; FORCE_REDUCTION-NEXT: [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ] -; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ] -; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> +; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = phi <8 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ] ; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], ; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]]) ; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]] ; FORCE_REDUCTION-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US]] ; FORCE_REDUCTION-NEXT: [[VAL_43]] = add i32 [[LOCAL_8_43_US]], 14910 -; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0 -; FORCE_REDUCTION-NEXT: [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_43]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> poison, i32 [[OP_RDX1]], i32 0 +; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[VAL_43]], i32 1 +; FORCE_REDUCTION-NEXT: [[TMP6]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> ; FORCE_REDUCTION-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll index 194c7021f60f5..a4c1d1bd98f8c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/PR40310.ll @@ -4,20 +4,20 @@ define void @mainTest(i32 %param, ptr %vals, i32 %len) { ; CHECK-LABEL: @mainTest( ; CHECK-NEXT: bci_15.preheader: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i32> , i32 [[PARAM:%.*]], i32 0 +; CHECK-NEXT: 
[[TMP0:%.*]] = insertelement <16 x i32> , i32 [[PARAM:%.*]], i32 0 ; CHECK-NEXT: br label [[BCI_15:%.*]] ; CHECK: bci_15: ; CHECK-NEXT: [[LOCAL_0_:%.*]] = phi i32 [ [[OP_RDX:%.*]], [[BCI_15]] ], [ [[PARAM]], [[BCI_15_PREHEADER:%.*]] ] ; CHECK-NEXT: [[LOCAL_4_:%.*]] = phi i32 [ [[V44:%.*]], [[BCI_15]] ], [ 31, [[BCI_15_PREHEADER]] ] -; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER]] ] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = phi <16 x i32> [ [[TMP6:%.*]], [[BCI_15]] ], [ [[TMP0]], [[BCI_15_PREHEADER]] ] ; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[TMP2]], ; CHECK-NEXT: store atomic i32 [[LOCAL_0_]], ptr [[VALS:%.*]] unordered, align 4 ; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v16i32(<16 x i32> [[TMP3]]) ; CHECK-NEXT: [[OP_RDX]] = and i32 [[TMP4]], [[LOCAL_4_]] ; CHECK-NEXT: [[V44]] = add i32 [[LOCAL_4_]], 16 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX]], i32 0 -; CHECK-NEXT: [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[V44]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x i32> poison, i32 [[OP_RDX]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i32> [[TMP7]], i32 [[V44]], i32 1 +; CHECK-NEXT: [[TMP6]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: br i1 true, label [[BCI_15]], label [[LOOPEXIT:%.*]] ; CHECK: loopexit: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll index e1ee35217d187..2ac19ffc564d3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bool-mask.ll @@ -21,13 +21,9 @@ define i64 @bitmask_16xi8(ptr nocapture noundef readonly %src) { ; SSE-NEXT: [[TOBOOL_NOT:%.*]] = icmp ne i8 [[TMP0]], 0 ; SSE-NEXT: [[OR:%.*]] = zext i1 [[TOBOOL_NOT]] to i64 ; SSE-NEXT: 
[[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 1 -; SSE-NEXT: [[TMP1:%.*]] = load <8 x i8>, ptr [[ARRAYIDX_1]], align 1 -; SSE-NEXT: [[TMP2:%.*]] = icmp eq <8 x i8> [[TMP1]], zeroinitializer -; SSE-NEXT: [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x i64> zeroinitializer, <8 x i64> -; SSE-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 9 -; SSE-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ARRAYIDX_9]], align 1 -; SSE-NEXT: [[TMP5:%.*]] = icmp eq <4 x i8> [[TMP4]], zeroinitializer -; SSE-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i64> zeroinitializer, <4 x i64> +; SSE-NEXT: [[TMP1:%.*]] = load <12 x i8>, ptr [[ARRAYIDX_1]], align 1 +; SSE-NEXT: [[TMP2:%.*]] = icmp eq <12 x i8> [[TMP1]], zeroinitializer +; SSE-NEXT: [[TMP3:%.*]] = select <12 x i1> [[TMP2]], <12 x i64> zeroinitializer, <12 x i64> ; SSE-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 13 ; SSE-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX_13]], align 1 ; SSE-NEXT: [[TOBOOL_NOT_13:%.*]] = icmp eq i8 [[TMP7]], 0 @@ -40,11 +36,7 @@ define i64 @bitmask_16xi8(ptr nocapture noundef readonly %src) { ; SSE-NEXT: [[TMP9:%.*]] = load i8, ptr [[ARRAYIDX_15]], align 1 ; SSE-NEXT: [[TOBOOL_NOT_15:%.*]] = icmp eq i8 [[TMP9]], 0 ; SSE-NEXT: [[OR_15:%.*]] = select i1 [[TOBOOL_NOT_15]], i64 0, i64 32768 -; SSE-NEXT: [[TMP10:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> poison, <4 x i32> -; SSE-NEXT: [[RDX_OP:%.*]] = or <4 x i64> [[TMP10]], [[TMP6]] -; SSE-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> [[RDX_OP]], <4 x i64> poison, <8 x i32> -; SSE-NEXT: [[TMP11:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> [[TMP12]], <8 x i32> -; SSE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.or.v8i64(<8 x i64> [[TMP11]]) +; SSE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.or.v12i64(<12 x i64> [[TMP3]]) ; SSE-NEXT: [[OP_RDX:%.*]] = or i64 [[TMP16]], [[OR_13]] ; SSE-NEXT: [[OP_RDX5:%.*]] = or i64 [[OR_14]], [[OR_15]] ; SSE-NEXT: [[OP_RDX6:%.*]] 
= or i64 [[OP_RDX]], [[OP_RDX5]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll b/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll index c4ddc5d63cc04..8a9ea6886bda1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/bv-shuffle-mask.ll @@ -8,9 +8,7 @@ define i16 @test(i16 %v1, i16 %v2) { ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i16> , i16 [[V2]], i32 3 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> , i16 [[V1]], i32 3 ; CHECK-NEXT: [[TMP3:%.*]] = and <4 x i16> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> poison, <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x i16> [[TMP5]], i16 [[V1]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> [[TMP0]], <4 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = or <4 x i16> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = and <4 x i16> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP10:%.*]] = and <4 x i16> [[TMP9]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll index 7510b8fb83e34..83c0c028a2170 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/cse.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/cse.ll @@ -16,8 +16,9 @@ define i32 @test(ptr nocapture %G) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds double, ptr [[G:%.*]], i64 5 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[ARRAYIDX]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = fmul <4 x double> [[TMP1]], +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> poison, <3 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fmul <3 x double> [[TMP1]], +; CHECK-NEXT: [[TMP2:%.*]] = 
shufflevector <3 x double> [[TMP4]], <3 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], ; CHECK-NEXT: store <4 x double> [[TMP3]], ptr [[G]], align 8 ; CHECK-NEXT: ret i32 undef @@ -283,7 +284,7 @@ return: ; preds = %entry, %if.end define void @PR19646(ptr %this, i1 %arg) { ; CHECK-LABEL: @PR19646( ; CHECK-NEXT: entry: -; CHECK-NEXT: br i1 %arg, label [[IF_END13:%.*]], label [[IF_END13]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[IF_END13:%.*]], label [[IF_END13]] ; CHECK: sw.epilog7: ; CHECK-NEXT: [[DOTIN:%.*]] = getelementptr inbounds [[CLASS_B_53_55:%.*]], ptr [[THIS:%.*]], i64 0, i32 0, i32 1 ; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[DOTIN]], align 8 @@ -294,7 +295,7 @@ define void @PR19646(ptr %this, i1 %arg) { ; CHECK-NEXT: [[_DY:%.*]] = getelementptr inbounds [[CLASS_B_53_55]], ptr [[THIS]], i64 0, i32 0, i32 2 ; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[_DY]], align 8 ; CHECK-NEXT: [[ADD10:%.*]] = fadd double [[ADD8]], [[TMP2]] -; CHECK-NEXT: br i1 %arg, label [[IF_THEN12:%.*]], label [[IF_END13]] +; CHECK-NEXT: br i1 [[ARG]], label [[IF_THEN12:%.*]], label [[IF_END13]] ; CHECK: if.then12: ; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr undef, align 8 ; CHECK-NEXT: br label [[IF_END13]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll b/llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll index 07ee8f840721f..8df459a9a22b7 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/div-possibly-extended-with-poisons.ll @@ -11,20 +11,24 @@ define i8 @test(ptr %g_127, i32 %0, i16 %1) { ; CHECK: [[FOR_INC434_I]]: ; CHECK-NEXT: [[TMP2:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ 60, %[[FOR_COND166_PREHEADER_I]] ] ; CHECK-NEXT: [[CONV8_I_I:%.*]] = zext nneg i32 [[TMP0]] to i64 -; CHECK-NEXT: [[DIV_I_I_1:%.*]] = udiv i64 [[CONV8_I_I]], [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = trunc i64 
[[DIV_I_I_1]] to i16 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x i16> poison, i16 [[TMP3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> poison, i64 [[CONV8_I_I]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP9:%.*]] = udiv <4 x i64> [[TMP6]], [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i16> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i16> [[TMP10]], <4 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> [[TMP11]], <8 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x i16> [[TMP12]], <8 x i16> poison, <8 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> [[TMP13]]) -; CHECK-NEXT: [[TMP15:%.*]] = call i16 @llvm.vector.reduce.and.v8i16(<8 x i16> [[TMP14]]) +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x i64> poison, i64 [[CONV8_I_I]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <3 x i64> [[TMP3]], <3 x i64> poison, <3 x i32> zeroinitializer +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <3 x i64> poison, i64 [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <3 x i64> [[TMP5]], <3 x i64> poison, <3 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = udiv <3 x i64> [[TMP4]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = trunc <3 x i64> [[TMP7]] to <3 x i16> +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <3 x i16> [[TMP8]], <3 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> [[TMP9]]) +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x i64> poison, i64 [[CONV8_I_I]], i32 0 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: 
[[TMP13:%.*]] = insertelement <2 x i64> poison, i64 [[TMP2]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP19:%.*]] = udiv <2 x i64> [[TMP12]], [[TMP14]] +; CHECK-NEXT: [[TMP16:%.*]] = trunc <2 x i64> [[TMP19]] to <2 x i16> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i16> [[TMP16]], <2 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> [[TMP17]]) +; CHECK-NEXT: [[RDX_OP:%.*]] = and <4 x i16> [[TMP10]], [[TMP18]] +; CHECK-NEXT: [[TMP15:%.*]] = call i16 @llvm.vector.reduce.and.v4i16(<4 x i16> [[RDX_OP]]) ; CHECK-NEXT: [[OP_RDX:%.*]] = and i16 [[TMP15]], [[TMP1]] ; CHECK-NEXT: [[AND14_I_2_I_5:%.*]] = zext i16 [[OP_RDX]] to i32 ; CHECK-NEXT: store i32 [[AND14_I_2_I_5]], ptr [[G_127]], align 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll index 73b73735da021..75f38c5d0807c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-multi-register-use.ll @@ -11,14 +11,12 @@ define void @test(double %i) { ; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> zeroinitializer, [[TMP2]] ; CHECK-NEXT: [[I75:%.*]] = fsub double 0.000000e+00, [[I]] ; CHECK-NEXT: [[TMP5:%.*]] = fsub <2 x double> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP5]], <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <4 x double> [[TMP6]], <4 x double> [[TMP7]], <8 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <8 x double> [[TMP28]], <8 x double> , <8 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[I75]], i32 3 -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector 
<8 x double> [[TMP9]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x double> zeroinitializer, [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd <8 x double> zeroinitializer, [[TMP11]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP1]], <6 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <6 x double> [[TMP10]], <6 x double> , <6 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <6 x double> [[TMP6]], double [[I75]], i32 3 +; CHECK-NEXT: [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = fadd <6 x double> zeroinitializer, [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x double> [[TMP9]], <6 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = fadd <8 x double> [[TMP12]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = fcmp ult <8 x double> [[TMP13]], zeroinitializer ; CHECK-NEXT: br label [[BB116:%.*]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll index 6942df532ae29..9655cc3d89d3c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll @@ -9,12 +9,13 @@ define void @foo(double %i) { ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> zeroinitializer, [[TMP0]] ; CHECK-NEXT: [[I82:%.*]] = fsub double 0.000000e+00, poison ; CHECK-NEXT: [[I103:%.*]] = fsub double 0.000000e+00, [[I]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP8]], <8 x double> , <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[I82]], i32 2 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x double> [[TMP6]], <8 x double> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = fmul <8 x double> , [[TMP7]] -; 
CHECK-NEXT: [[TMP13:%.*]] = fadd <8 x double> zeroinitializer, [[TMP12]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <5 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <5 x double> [[TMP2]], <5 x double> , <5 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <5 x double> [[TMP3]], double [[I82]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = fmul <5 x double> , [[TMP4]] +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <5 x double> [[TMP5]], <5 x double> poison, <6 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = fadd <6 x double> [[TMP6]], zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x double> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = fcmp ult <8 x double> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[TMP16:%.*]] = freeze <8 x i1> [[TMP15]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll index 7bbc694dc5181..51ee894894081 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelements-vector-ops-shuffle.ll @@ -10,7 +10,7 @@ define double @test() { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr getelementptr inbounds ([13 x double], ptr null, i64 0, i64 8), align 16 ; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x double> , double [[TMP3]], i32 2 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP17]], double [[TMP2]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x double> zeroinitializer, [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = fmul <4 x double> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call reassoc nsz double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP5]]) ; CHECK-NEXT: [[TMP7:%.*]] = fmul double [[TMP6]], 0.000000e+00 ; CHECK-NEXT: store double [[TMP7]], ptr null, 
align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll index 18e03df0fbcc9..132865da252c9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelemets-extended-by-poison.ll @@ -7,14 +7,14 @@ define i32 @test() { ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr null, align 16 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x i64> [[TMP0]], i32 1 ; CHECK-NEXT: [[TMP13:%.*]] = or i64 [[TMP12]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> , <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[TMP4]], <8 x i64> [[TMP3]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = trunc <8 x i64> [[TMP5]] to <8 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[TMP6]], <8 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = trunc <8 x i64> [[TMP1]] to <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <6 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <6 x i64> [[TMP3]], <6 x i64> , <6 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <6 x i64> [[TMP4]], <6 x i64> [[TMP3]], <6 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = trunc <6 x i64> [[TMP5]] to <6 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <6 x i32> [[TMP6]], <6 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <6 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = trunc <6 x i64> [[TMP10]] to <6 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <6 x i32> [[TMP11]], <6 x i32> poison, <8 x i32> ; CHECK-NEXT: 
[[TMP15:%.*]] = add <8 x i32> [[TMP14]], zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = add <16 x i32> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = extractelement <4 x i64> [[TMP0]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll index 5e3d4715e99c5..7455fedf0762c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/full-match-with-poison-scalar.ll @@ -7,10 +7,10 @@ define i32 @test() { ; CHECK-NEXT: br label %[[FUNC_135_EXIT_I:.*]] ; CHECK: [[FUNC_135_EXIT_I]]: ; CHECK-NEXT: [[G_228_PROMOTED166_I1105_I:%.*]] = phi i32 [ 0, %[[ENTRY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x i32> , i32 [[G_228_PROMOTED166_I1105_I]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i32> , [[TMP1]] -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <8 x i32> [[TMP2]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <5 x i32> , i32 [[G_228_PROMOTED166_I1105_I]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <5 x i32> [[TMP0]], <5 x i32> poison, <5 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = add <5 x i32> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <5 x i32> [[TMP2]], <5 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = icmp ugt <16 x i32> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = icmp ult <16 x i32> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP12]], <16 x i1> [[TMP13]], <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll index ce65f532e0b3b..e1868a26b4127 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/X86/full-matched-bv-with-subvectors.ll @@ -6,42 +6,38 @@ define i32 @test(i64 %l.549) { ; CHECK-SAME: i64 [[L_549:%.*]]) { ; CHECK-NEXT: [[ENTRY:.*]]: ; CHECK-NEXT: [[CONV3:%.*]] = sext i32 0 to i64 -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[CONV3]], i32 3 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 0, i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> [[TMP3]], i64 0, i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i64> poison, i64 [[CONV3]], i32 2 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i64> poison, i64 [[L_549]], i32 0 ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i64> [[TMP8]], <4 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP2]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: br label %[[IF_THEN19:.*]] ; CHECK: [[P:.*]]: -; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i64> [ zeroinitializer, %[[IF_END29:.*]] ], [ [[TMP13:%.*]], %[[IF_END25:.*]] ] -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <2 x i64> [[TMP5]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = phi <4 x i64> [ zeroinitializer, %[[IF_END29:.*]] ], [ [[TMP14:%.*]], %[[IF_END25:.*]] ] ; CHECK-NEXT: br i1 false, label %[[S:.*]], label %[[Q:.*]] ; CHECK: [[Q]]: ; CHECK-NEXT: [[XOR39:%.*]] = phi i64 [ 0, %[[P]] ], [ 0, %[[LAND_LHS_TRUE:.*]] ] ; CHECK-NEXT: [[TMP16:%.*]] = phi <2 x i64> [ zeroinitializer, %[[P]] ], [ zeroinitializer, %[[LAND_LHS_TRUE]] ] -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[XOR39]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP0]], i64 [[XOR39]], i32 3 ; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i64> [[TMP16]], <2 x i64> poison, <4 x i32> ; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> [[TMP18]], <4 x i32> ; CHECK-NEXT: br i1 false, label %[[LOR_LHS_FALSE:.*]], label %[[R:.*]] ; CHECK: [[LOR_LHS_FALSE]]: +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i64> 
[[TMP18]], <4 x i64> [[TMP19]], <4 x i32> ; CHECK-NEXT: br i1 false, label %[[LAND_LHS_TRUE]], label %[[S]] ; CHECK: [[R]]: -; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x i64> [ [[TMP19]], %[[Q]] ], [ [[TMP20:%.*]], %[[IF_THEN19]] ] +; CHECK-NEXT: [[TMP17:%.*]] = phi <4 x i64> [ [[TMP19]], %[[Q]] ], [ [[TMP20:%.*]], %[[IF_THEN19]] ] ; CHECK-NEXT: br i1 false, label %[[S]], label %[[LAND_LHS_TRUE]] ; CHECK: [[LAND_LHS_TRUE]]: -; CHECK-NEXT: [[TMP22:%.*]] = phi <4 x i64> [ [[TMP21]], %[[R]] ], [ zeroinitializer, %[[LOR_LHS_FALSE]] ] +; CHECK-NEXT: [[TMP21:%.*]] = phi <4 x i64> [ [[TMP17]], %[[R]] ], [ zeroinitializer, %[[LOR_LHS_FALSE]] ] ; CHECK-NEXT: br i1 false, label %[[Q]], label %[[S]] ; CHECK: [[S]]: -; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP22]], %[[LAND_LHS_TRUE]] ], [ [[TMP21]], %[[R]] ], [ [[TMP19]], %[[LOR_LHS_FALSE]] ], [ [[TMP17]], %[[P]] ] +; CHECK-NEXT: [[TMP10:%.*]] = phi <4 x i64> [ [[TMP21]], %[[LAND_LHS_TRUE]] ], [ [[TMP17]], %[[R]] ], [ [[TMP12]], %[[LOR_LHS_FALSE]] ], [ [[TMP3]], %[[P]] ] ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i64> [[TMP10]], <4 x i64> poison, <2 x i32> ; CHECK-NEXT: br label %[[IF_THEN19]] ; CHECK: [[IF_THEN19]]: -; CHECK-NEXT: [[TMP12:%.*]] = phi <2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP11]], %[[S]] ] -; CHECK-NEXT: [[TMP13]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> poison, <2 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i64> [[TMP12]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP14]], <4 x i64> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP20]] = shufflevector <4 x i64> [[TMP15]], <4 x i64> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = phi <2 x i64> [ zeroinitializer, %[[ENTRY]] ], [ [[TMP11]], %[[S]] ] +; CHECK-NEXT: [[TMP14]] = shufflevector <2 x i64> [[TMP13]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i64> [[TMP14]], <4 x i64> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[TMP20]] = shufflevector <4 x i64> 
[[TMP15]], <4 x i64> [[TMP9]], <4 x i32> ; CHECK-NEXT: br i1 false, label %[[R]], label %[[IF_END25]] ; CHECK: [[IF_END25]]: ; CHECK-NEXT: br i1 false, label %[[IF_END29]], label %[[P]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-reduced.ll b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-reduced.ll index 42ed26d82e036..e74b1f09114eb 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-reduced.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gather-node-same-reduced.ll @@ -5,7 +5,8 @@ define i64 @test(ptr %p) { ; CHECK-LABEL: define i64 @test( ; CHECK-SAME: ptr [[P:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P]], i64 12 -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> zeroinitializer, zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <3 x i32> zeroinitializer, zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP13]], <3 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], zeroinitializer @@ -87,7 +88,8 @@ define i64 @test1(ptr %p) { ; CHECK-LABEL: define i64 @test1( ; CHECK-SAME: ptr [[P:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[P]], i64 12 -; CHECK-NEXT: [[TMP2:%.*]] = xor <4 x i32> zeroinitializer, zeroinitializer +; CHECK-NEXT: [[TMP13:%.*]] = xor <3 x i32> zeroinitializer, zeroinitializer +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x i32> [[TMP13]], <3 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = xor <4 x i32> [[TMP2]], zeroinitializer ; CHECK-NEXT: [[TMP4:%.*]] = xor <4 x i32> [[TMP3]], zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP4]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll b/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll index e42e6183b8cae..889ed393e3f45 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/gathered-shuffle-resized.ll @@ -17,10 +17,14 @@ define ptr @test(ptr %0, ptr %args_gep) { ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[ARG1]], i64 12 ; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr [[TMP6]], align 8, !noalias [[META0:![0-9]+]] ; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !noalias [[META0]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> [[TMP9]], <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <7 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <9 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <9 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <9 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <9 x i32> [[TMP11]], <9 x i32> [[TMP18]], <9 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <7 x i32> [[TMP10]], <7 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <9 x i32> [[TMP19]], <9 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x i32> [[TMP20]], <16 x i32> [[TMP21]], <16 x i32> ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq <16 x i32> [[TMP13]], zeroinitializer ; CHECK-NEXT: [[TMP15:%.*]] = zext <16 x i1> [[TMP14]] to <16 x i8> ; CHECK-NEXT: store <16 x i8> [[TMP15]], ptr [[TMP5]], align 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll b/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll index f07424f0d2934..14fdd17753805 
100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/identity-match-splat-less-defined.ll @@ -3,29 +3,18 @@ define i32 @test() { ; CHECK-LABEL: define i32 @test() { +; CHECK-NEXT: [[TMP2:%.*]] = or i32 0, 0 ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 0, i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = or <4 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <16 x i32> -; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP25]], zeroinitializer -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <24 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <64 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP5]], <64 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <64 x i32> [[TMP9]], <64 x i32> , <64 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <64 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <64 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <64 x i32> [[TMP10]], <64 x i32> [[TMP12]], <64 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <64 x i32> , i32 [[TMP2]], i32 7 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <64 x i32> [[TMP8]], <64 x i32> poison, <23 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <23 x i32> 
; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <64 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <64 x i32> [[TMP13]], <64 x i32> [[TMP15]], <64 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <24 x i32> [[TMP6]], <24 x i32> poison, <64 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <64 x i32> [[TMP16]], <64 x i32> [[TMP15]], <64 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <16 x i32> [[TMP4]], <16 x i32> poison, <64 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <64 x i32> [[TMP27]], <64 x i32> [[TMP28]], <64 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <64 x i32> zeroinitializer, [[TMP18]] -; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i32> zeroinitializer, [[TMP18]] +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <64 x i32> [[TMP8]], <64 x i32> [[TMP15]], <23 x i32> +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <23 x i32> [[TMP9]], <23 x i32> poison, <64 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq <64 x i32> [[TMP18]], zeroinitializer +; CHECK-NEXT: [[TMP20:%.*]] = icmp ne <64 x i32> [[TMP18]], zeroinitializer ; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <64 x i1> [[TMP19]], <64 x i1> [[TMP20]], <64 x i32> ; CHECK-NEXT: [[TMP22:%.*]] = zext <64 x i1> [[TMP21]] to <64 x i8> ; CHECK-NEXT: [[TMP23:%.*]] = call i8 @llvm.vector.reduce.xor.v64i8(<64 x i8> [[TMP22]]) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll index 0fddb7322e9b3..989da443dd7fd 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insert-subvector.ll @@ -11,21 +11,16 @@ define <16 x double> @test(ptr %x, double %v, double %a) { ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x double>, ptr [[GEP8]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> poison, double [[A]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x double> [[TMP4]], <16 x double> poison, <16 x i32> zeroinitializer -; 
CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x double> poison, double [[V]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP2]], <4 x double> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x double> poison, double [[V]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> poison, <16 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x double> [[TMP10]], <16 x double> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <16 x i32> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x double> [[TMP12]], <16 x double> [[TMP13]], <16 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <6 x double> poison, double [[V]], i32 0 +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <6 x double> [[TMP9]], <6 x double> poison, <6 x i32> zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <6 x double> [[TMP16]], <6 x double> poison, <16 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x double> [[TMP14]], <16 x double> [[TMP15]], <16 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x double> [[TMP9]], <2 x double> poison, <16 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x double> [[TMP16]], <16 x double> [[TMP20]], <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <16 x double> [[TMP21]], <16 x double> [[TMP20]], <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x double> [[TMP19]], <16 x double> [[TMP20]], <16 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP7]], <2 x double> poison, <6 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <6 x double> [[TMP16]], 
<6 x double> [[TMP10]], <16 x i32> +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <16 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <16 x double> [[TMP11]], <16 x double> [[TMP12]], <16 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x double> [[TMP6]], <2 x double> poison, <16 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = shufflevector <16 x double> [[TMP13]], <16 x double> [[TMP14]], <16 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = fadd <16 x double> [[TMP5]], [[TMP17]] ; CHECK-NEXT: ret <16 x double> [[TMP18]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll index 4f94784a24dd4..86d20eb20cf2f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=haswell | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=haswell | FileCheck %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=x86-64-v4 | FileCheck %s ;unsigned load_le32(unsigned char *data) { ; unsigned le32 = (data[0]<<0) | (data[1]<<8) | (data[2]<<16) | (data[3]<<24); @@ -51,24 +51,10 @@ entry: } define <4 x float> @PR16739_byref(ptr nocapture readonly dereferenceable(16) %x) { -; AVX2-LABEL: @PR16739_byref( -; AVX2-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, ptr [[X:%.*]], i64 0, i64 2 -; AVX2-NEXT: 
[[TMP1:%.*]] = load <2 x float>, ptr [[X]], align 4 -; AVX2-NEXT: [[X2:%.*]] = load float, ptr [[GEP2]], align 4 -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> -; AVX2-NEXT: [[I2:%.*]] = insertelement <4 x float> [[TMP2]], float [[X2]], i32 2 -; AVX2-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3 -; AVX2-NEXT: ret <4 x float> [[I3]] -; -; AVX512-LABEL: @PR16739_byref( -; AVX512-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, ptr [[X:%.*]], i64 0, i64 1 -; AVX512-NEXT: [[X0:%.*]] = load float, ptr [[X]], align 4 -; AVX512-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4 -; AVX512-NEXT: [[I0:%.*]] = insertelement <4 x float> poison, float [[X0]], i32 0 -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> -; AVX512-NEXT: [[I21:%.*]] = shufflevector <4 x float> [[I0]], <4 x float> [[TMP2]], <4 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[I21]], <4 x float> [[TMP2]], <4 x i32> -; AVX512-NEXT: ret <4 x float> [[TMP3]] +; CHECK-LABEL: @PR16739_byref( +; CHECK-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP2]] ; %gep1 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 1 %gep2 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 2 @@ -103,10 +89,11 @@ define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16 ; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32> -; CHECK-NEXT: 
[[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <3 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x i64> [[TMP2]], i64 [[T8]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = trunc <3 x i64> [[TMP3]] to <3 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <3 x i32> [[TMP4]] to <3 x float> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[TMP5]] ; %t1 = load i64, ptr %x, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll index 700e3ed9effc4..7efd1b16e6862 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=haswell | FileCheck %s --check-prefixes=CHECK,AVX2 -; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512 +; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=haswell | FileCheck %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-hor -slp-vectorize-hor-store -S < %s -mtriple=x86_64-apple-macosx -mcpu=x86-64-v4 | FileCheck %s ;unsigned load_le32(unsigned char *data) { ; unsigned le32 = (data[0]<<0) | (data[1]<<8) | (data[2]<<16) | (data[3]<<24); @@ -51,24 +51,10 @@ entry: } define <4 x float> @PR16739_byref(ptr nocapture readonly dereferenceable(16) %x) { -; AVX2-LABEL: @PR16739_byref( -; AVX2-NEXT: [[GEP2:%.*]] = getelementptr inbounds <4 x float>, ptr [[X:%.*]], i64 0, i64 2 -; AVX2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[X]], align 4 -; AVX2-NEXT: 
[[X2:%.*]] = load float, ptr [[GEP2]], align 4 -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> -; AVX2-NEXT: [[I2:%.*]] = insertelement <4 x float> [[TMP2]], float [[X2]], i32 2 -; AVX2-NEXT: [[I3:%.*]] = insertelement <4 x float> [[I2]], float [[X2]], i32 3 -; AVX2-NEXT: ret <4 x float> [[I3]] -; -; AVX512-LABEL: @PR16739_byref( -; AVX512-NEXT: [[GEP1:%.*]] = getelementptr inbounds <4 x float>, ptr [[X:%.*]], i64 0, i64 1 -; AVX512-NEXT: [[X0:%.*]] = load float, ptr [[X]], align 4 -; AVX512-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4 -; AVX512-NEXT: [[I0:%.*]] = insertelement <4 x float> undef, float [[X0]], i32 0 -; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> -; AVX512-NEXT: [[I21:%.*]] = shufflevector <4 x float> [[I0]], <4 x float> [[TMP2]], <4 x i32> -; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[I21]], <4 x float> [[TMP2]], <4 x i32> -; AVX512-NEXT: ret <4 x float> [[TMP3]] +; CHECK-LABEL: @PR16739_byref( +; CHECK-NEXT: [[TMP1:%.*]] = load <3 x float>, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <4 x i32> +; CHECK-NEXT: ret <4 x float> [[TMP2]] ; %gep1 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 1 %gep2 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 2 @@ -103,10 +89,11 @@ define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16 ; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float> +; CHECK-NEXT: 
[[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <3 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <3 x i64> [[TMP2]], i64 [[T8]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = trunc <3 x i64> [[TMP3]] to <3 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = bitcast <3 x i32> [[TMP4]] to <3 x float> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <3 x float> [[TMP6]], <3 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[TMP5]] ; %t1 = load i64, ptr %x, align 16 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll index f18a72b0bf776..10588da5ef663 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-partial-vector-shuffle.ll @@ -7,33 +7,14 @@ define <2 x i64> @load_00123456(ptr nocapture noundef readonly %data) { ; SSE-LABEL: @load_00123456( -; SSE-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[DATA:%.*]], i64 1 -; SSE-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 2 -; SSE-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 3 -; SSE-NEXT: [[T0:%.*]] = load i16, ptr [[DATA]], align 2 -; SSE-NEXT: [[T1:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2 -; SSE-NEXT: [[T2:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2 -; SSE-NEXT: [[TMP1:%.*]] = load <4 x i16>, ptr [[ARRAYIDX3]], align 2 -; SSE-NEXT: [[VECINIT0_I_I:%.*]] = insertelement <8 x i16> undef, i16 [[T0]], i64 0 -; SSE-NEXT: [[VECINIT1_I_I:%.*]] = insertelement <8 x i16> [[VECINIT0_I_I]], i16 [[T0]], i64 1 -; SSE-NEXT: [[VECINIT2_I_I:%.*]] = insertelement <8 x i16> [[VECINIT1_I_I]], i16 [[T1]], i64 2 -; SSE-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I_I]], i16 [[T2]], i64 3 -; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[TMP1]], <4 x i16> poison, <8 x i32> -; SSE-NEXT: [[VECINIT7_I_I:%.*]] = shufflevector <8 x i16> [[VECINIT3_I_I]], <8 x i16> [[TMP2]], 
<8 x i32> +; SSE-NEXT: [[TMP1:%.*]] = load <7 x i16>, ptr [[DATA:%.*]], align 2 +; SSE-NEXT: [[VECINIT7_I_I:%.*]] = shufflevector <7 x i16> [[TMP1]], <7 x i16> poison, <8 x i32> ; SSE-NEXT: [[T7:%.*]] = bitcast <8 x i16> [[VECINIT7_I_I]] to <2 x i64> ; SSE-NEXT: ret <2 x i64> [[T7]] ; ; AVX-LABEL: @load_00123456( -; AVX-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[DATA:%.*]], i64 2 -; AVX-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i16, ptr [[DATA]], i64 3 -; AVX-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[DATA]], align 2 -; AVX-NEXT: [[T2:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2 -; AVX-NEXT: [[TMP2:%.*]] = load <4 x i16>, ptr [[ARRAYIDX3]], align 2 -; AVX-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <8 x i32> -; AVX-NEXT: [[VECINIT2_I_I2:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> [[TMP3]], <8 x i32> -; AVX-NEXT: [[VECINIT3_I_I:%.*]] = insertelement <8 x i16> [[VECINIT2_I_I2]], i16 [[T2]], i64 3 -; AVX-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[TMP2]], <4 x i16> poison, <8 x i32> -; AVX-NEXT: [[VECINIT7_I_I1:%.*]] = shufflevector <8 x i16> [[VECINIT3_I_I]], <8 x i16> [[TMP4]], <8 x i32> +; AVX-NEXT: [[TMP1:%.*]] = load <7 x i16>, ptr [[DATA:%.*]], align 2 +; AVX-NEXT: [[VECINIT7_I_I1:%.*]] = shufflevector <7 x i16> [[TMP1]], <7 x i16> poison, <8 x i32> ; AVX-NEXT: [[T7:%.*]] = bitcast <8 x i16> [[VECINIT7_I_I1]] to <2 x i64> ; AVX-NEXT: ret <2 x i64> [[T7]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll index 842bd6c6bec37..9db0320835bf3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/malformed_phis.ll @@ -103,9 +103,8 @@ define i64 @test_3() #0 { ; CHECK: bb2: ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x i32> [ splat (i32 3), [[BB1]] ], [ poison, [[BB2:%.*]] ] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i32> 
[[TMP2]], <2 x i32> poison, <28 x i32> -; CHECK-NEXT: [[VAL4:%.*]] = extractelement <28 x i32> [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = phi <28 x i32> [ splat (i32 3), [[BB1]] ], [ poison, [[BB2:%.*]] ] +; CHECK-NEXT: [[VAL4:%.*]] = extractelement <28 x i32> [[TMP3]], i32 1 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <32 x i32> poison, i32 [[VAL4]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <32 x i32> [[TMP0]], <32 x i32> poison, <32 x i32> zeroinitializer ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i32> [[TMP1]], <32 x i32> poison, <28 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll index 04359eb6fcd7c..9f3a45b7a15b4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/non-power-of-2-order-detection.ll @@ -16,10 +16,10 @@ define void @e(ptr %c, i64 %0) { ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x ptr> [[TMP5]], <2 x ptr> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <6 x ptr> poison, ptr [[TMP2]], i32 2 ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <6 x ptr> [[TMP7]], ptr [[TMP1]], i32 3 -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <2 x ptr> [[TMP4]], <2 x ptr> poison, <6 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <6 x ptr> [[TMP8]], <6 x ptr> [[TMP19]], <6 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x ptr> [[TMP6]], <2 x ptr> poison, <6 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <6 x ptr> [[TMP20]], <6 x ptr> [[TMP21]], <6 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <6 x ptr> [[TMP8]], <6 x ptr> [[TMP21]], <6 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x ptr> [[TMP4]], <2 x ptr> poison, <6 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <6 x ptr> [[TMP19]], <6 x ptr> [[TMP20]], <6 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint <6 x ptr> [[TMP10]] to <6 x i64> ; 
CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x i64> [[TMP11]], <6 x i64> poison, <32 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = insertelement <32 x i64> poison, i64 [[TMP0]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index db38a62017391..ad920013ad013 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -643,7 +643,6 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; ; AVX512F-LABEL: @gather_load_div( ; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> ; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512F-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] @@ -653,7 +652,6 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; ; AVX512VL-LABEL: @gather_load_div( ; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> ; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index bfa3610804967..d78d68820d4d5 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -643,7 +643,6 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; ; AVX512F-LABEL: @gather_load_div( ; AVX512F-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> ; AVX512F-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512F-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512F-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] @@ -653,7 +652,6 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; ; AVX512VL-LABEL: @gather_load_div( ; AVX512VL-NEXT: [[TMP3:%.*]] = call <45 x float> @llvm.masked.load.v45f32.p0(ptr [[TMP1:%.*]], i32 4, <45 x i1> , <45 x float> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <16 x i32> ; AVX512VL-NEXT: [[TMP7:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP8:%.*]] = shufflevector <45 x float> [[TMP3]], <45 x float> poison, <8 x i32> ; AVX512VL-NEXT: [[TMP9:%.*]] = fdiv <8 x float> [[TMP7]], [[TMP8]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll index 782aada17acac..e0d1f1b0ff019 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47642.ll @@ -23,10 +23,9 @@ define <4 x i32> @foo(<4 x i32> %x, i32 %f) { define <4 x i32> @bar(<4 x i32> %x, i32 %f) { ; CHECK-LABEL: @bar( -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[F:%.*]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x 
i32> zeroinitializer -; CHECK-NEXT: [[TMP3:%.*]] = add nsw <2 x i32> [[TMP2]], -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[F:%.*]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[TMP2]], ; CHECK-NEXT: ret <4 x i32> [[TMP4]] ; %add = add nsw i32 %f, 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll index bada001ebbc6c..b8bc64d3bb728 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr49081.ll @@ -5,9 +5,9 @@ define dso_local <4 x float> @foo(<4 x i32> %0) { ; CHECK-LABEL: @foo( -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i32> [[TMP1:%.*]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = sitofp <4 x i32> [[TMP0]] to <4 x float> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[TMP2]], <4 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP0:%.*]], <4 x i32> poison, <3 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = sitofp <3 x i32> [[TMP2]] to <3 x float> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <3 x float> [[TMP4]], <3 x float> poison, <4 x i32> ; CHECK-NEXT: ret <4 x float> [[TMP3]] ; %2 = extractelement <4 x i32> %0, i32 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll index 2612a21b9eedf..b8a3f9de76039 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduced-val-vectorized-in-transform.ll @@ -13,7 +13,7 @@ define i32 @test(i1 %cond) { ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> , <4 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = 
insertelement <4 x i32> poison, i32 [[P1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP10]], <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> zeroinitializer, [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = or <4 x i32> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[OR92]] = or i32 1, 0 ; CHECK-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.xor.v4i32(<4 x i32> [[TMP5]]) ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i32> , i32 [[TMP6]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll index 7bb436b9543bf..55a22731aaf9f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather.ll @@ -5,7 +5,6 @@ define void @test(ptr noalias %0, ptr %p) { ; CHECK-LABEL: @test( ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 2 ; CHECK-NEXT: [[TMP3:%.*]] = call <16 x float> @llvm.masked.load.v16f32.p0(ptr [[P:%.*]], i32 16, <16 x i1> , <16 x float> poison) -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> , <16 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = fadd reassoc nsz arcp contract afn <16 x float> [[TMP7]], [[TMP9]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll index 63dbf3ce78c32..72602ee2a74ed 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll @@ -9,7 +9,6 @@ define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 { ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr 
inbounds i8, ptr addrspace(1) [[TMP0:%.*]], i64 8 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8 ; CHECK-NEXT: [[TMP5:%.*]] = call <6 x float> @llvm.masked.load.v6f32.p1(ptr addrspace(1) [[TMP3]], i32 4, <6 x i1> , <6 x float> poison) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP4]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <6 x float> [[TMP5]], <6 x float> poison, <8 x i32> ; CHECK-NEXT: [[TMP13:%.*]] = fmul <8 x float> [[TMP12]], [[TMP9]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll index 7d65fe1bcde76..401ab79a08b3a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reordered-masked-loads.ll @@ -9,16 +9,14 @@ define void @test() { ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: [[ENTRY:.*:]] ; CHECK-NEXT: [[M1:%.*]] = alloca [[STRUCT_AE:%.*]], align 8 -; CHECK-NEXT: [[ARRAYIDX_I5_I:%.*]] = getelementptr i8, ptr [[M1]], i64 48 ; CHECK-NEXT: [[ARRAYIDX_I4:%.*]] = getelementptr i8, ptr null, i64 16 +; CHECK-NEXT: [[ARRAYIDX_I5_I:%.*]] = getelementptr i8, ptr [[M1]], i64 40 ; CHECK-NEXT: [[TMP1:%.*]] = load <5 x double>, ptr [[M1]], align 8 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <5 x double> [[TMP1]], <5 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = load <6 x double>, ptr [[M1]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <6 x double> [[TMP4]], <6 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[ARRAYIDX_I5_I]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <5 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <5 x 
double> [[TMP7]], <5 x double> [[TMP1]], <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = load <3 x double>, ptr [[ARRAYIDX_I5_I]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <3 x double> [[TMP4]], <3 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <3 x double> [[TMP4]], <3 x double> poison, <5 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <5 x double> [[TMP1]], <5 x double> [[TMP3]], <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <5 x double> [[TMP3]], <5 x double> [[TMP1]], <4 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = fadd <4 x double> [[TMP8]], [[TMP5]] ; CHECK-NEXT: [[TMP10:%.*]] = fptosi <4 x double> [[TMP9]] to <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = sitofp <4 x i32> [[TMP10]] to <4 x double> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll b/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll index 61294089fd4cb..22734f56813bc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/replaced-external-in-reduction.ll @@ -5,10 +5,11 @@ define void @test(i32 %0, ptr %p) { ; CHECK-LABEL: define void @test( ; CHECK-SAME: i32 [[TMP0:%.*]], ptr [[P:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = xor <4 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> , i32 [[TMP0]], i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = xor <8 x i32> [[TMP2]], ; CHECK-NEXT: [[OP_RDX:%.*]] = extractelement <8 x i32> [[TMP8]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP8]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: br i1 false, label [[EXIT:%.*]], label [[PH:%.*]] ; CHECK: ph: ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> , 
i32 [[TMP0]], i32 4 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll index 92a1e289044d7..cea0b1fb126c1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/resized-bv-values-non-power-of2-node.ll @@ -4,52 +4,56 @@ define <16 x half> @test(i32 %0, float %1, i32 %2) { ; CHECK-LABEL: define <16 x half> @test( ; CHECK-SAME: i32 [[TMP0:%.*]], float [[TMP1:%.*]], i32 [[TMP2:%.*]]) { -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x float> , float [[TMP1]], i32 13 -; CHECK-NEXT: [[TMP5:%.*]] = bitcast <16 x float> [[TMP4]] to <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <13 x float> , float [[TMP1]], i32 11 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <13 x float> [[TMP4]] to <13 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> zeroinitializer, <2 x float> zeroinitializer, <2 x float> zeroinitializer) ; CHECK-NEXT: [[TMP7:%.*]] = icmp ugt i32 [[TMP2]], 0 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i32 [[TMP0]], [[TMP0]] -; CHECK-NEXT: [[TMP9:%.*]] = extractelement <2 x float> [[TMP6]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = fcmp ogt float [[TMP9]], 0.000000e+00 ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP0]], 0 -; CHECK-NEXT: [[TMP13:%.*]] = extractelement <16 x i32> [[TMP5]], i32 10 -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 0 -; CHECK-NEXT: [[TMP15:%.*]] = bitcast float 0.000000e+00 to i32 +; CHECK-NEXT: [[TMP15:%.*]] = extractelement <13 x i32> [[TMP5]], i32 8 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i32 [[TMP15]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = bitcast float 0.000000e+00 to i32 +; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 0 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP0]], 0 ; CHECK-NEXT: [[TMP18:%.*]] = icmp ult i32 0, 0 ; CHECK-NEXT: [[TMP19:%.*]] = icmp 
ult i32 0, 0 ; CHECK-NEXT: [[TMP20:%.*]] = icmp ult i32 0, 0 ; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i32 0, 0 -; CHECK-NEXT: [[TMP22:%.*]] = extractelement <16 x i32> [[TMP5]], i32 4 -; CHECK-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP22]], 0 +; CHECK-NEXT: [[TMP22:%.*]] = extractelement <13 x i32> [[TMP5]], i32 3 +; CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP22]], 0 ; CHECK-NEXT: [[TMP24:%.*]] = icmp eq i32 [[TMP0]], [[TMP0]] -; CHECK-NEXT: [[TMP25:%.*]] = bitcast float 0.000000e+00 to i32 -; CHECK-NEXT: [[TMP26:%.*]] = icmp eq i32 [[TMP25]], 0 -; CHECK-NEXT: [[TMP27:%.*]] = icmp ult <16 x i32> [[TMP5]], zeroinitializer -; CHECK-NEXT: [[TMP28:%.*]] = select <16 x i1> [[TMP27]], <16 x i32> zeroinitializer, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP29:%.*]] = sitofp <16 x i32> [[TMP28]] to <16 x float> -; CHECK-NEXT: [[TMP30:%.*]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> [[TMP29]], <16 x float> zeroinitializer, <16 x float> zeroinitializer) -; CHECK-NEXT: [[TMP31:%.*]] = fadd <16 x float> [[TMP30]], zeroinitializer -; CHECK-NEXT: [[TMP35:%.*]] = select <16 x i1> zeroinitializer, <16 x float> zeroinitializer, <16 x float> [[TMP31]] -; CHECK-NEXT: [[TMP36:%.*]] = bitcast <16 x float> [[TMP35]] to <16 x i32> -; CHECK-NEXT: [[TMP37:%.*]] = and <16 x i32> [[TMP36]], zeroinitializer -; CHECK-NEXT: [[TMP38:%.*]] = bitcast <16 x i32> [[TMP37]] to <16 x float> +; CHECK-NEXT: [[TMP23:%.*]] = bitcast float 0.000000e+00 to i32 +; CHECK-NEXT: [[TMP32:%.*]] = icmp eq i32 [[TMP23]], 0 +; CHECK-NEXT: [[TMP25:%.*]] = icmp ult <13 x i32> [[TMP5]], zeroinitializer +; CHECK-NEXT: [[TMP26:%.*]] = select <13 x i1> [[TMP25]], <13 x i32> zeroinitializer, <13 x i32> zeroinitializer +; CHECK-NEXT: [[TMP27:%.*]] = sitofp <13 x i32> [[TMP26]] to <13 x float> +; CHECK-NEXT: [[TMP28:%.*]] = shufflevector <13 x float> [[TMP27]], <13 x float> poison, <14 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = call <14 x float> @llvm.fmuladd.v14f32(<14 x float> [[TMP28]], <14 x float> zeroinitializer, 
<14 x float> zeroinitializer) +; CHECK-NEXT: [[TMP30:%.*]] = fadd <14 x float> [[TMP29]], zeroinitializer +; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <14 x float> [[TMP30]], <14 x float> poison, <15 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = insertelement <15 x i1> , i1 false, i32 0 +; CHECK-NEXT: [[TMP50:%.*]] = insertelement <15 x i1> [[TMP49]], i1 false, i32 5 +; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <15 x i1> [[TMP50]], <15 x i1> poison, <15 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = select <15 x i1> [[TMP51]], <15 x float> zeroinitializer, <15 x float> [[TMP55]] +; CHECK-NEXT: [[TMP57:%.*]] = bitcast <15 x float> [[TMP56]] to <15 x i32> +; CHECK-NEXT: [[TMP58:%.*]] = and <15 x i32> [[TMP57]], zeroinitializer +; CHECK-NEXT: [[TMP59:%.*]] = bitcast <15 x i32> [[TMP58]] to <15 x float> +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <15 x float> [[TMP59]], <15 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP53:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x float> , <16 x float> [[TMP53]], <16 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = call <16 x float> @llvm.fmuladd.v16f32(<16 x float> zeroinitializer, <16 x float> [[TMP38]], <16 x float> [[TMP39]]) -; CHECK-NEXT: [[TMP41:%.*]] = extractelement <16 x float> [[TMP29]], i32 0 -; CHECK-NEXT: [[TMP42:%.*]] = fcmp olt float [[TMP41]], 0.000000e+00 -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <16 x float> [[TMP29]], i32 14 -; CHECK-NEXT: [[TMP44:%.*]] = fcmp ogt float [[TMP43]], 0.000000e+00 -; CHECK-NEXT: [[TMP45:%.*]] = fcmp olt float [[TMP43]], 0.000000e+00 -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <16 x float> [[TMP29]], i32 13 -; CHECK-NEXT: [[TMP47:%.*]] = fcmp ogt float [[TMP46]], 0.000000e+00 -; CHECK-NEXT: [[TMP48:%.*]] = fcmp olt float [[TMP46]], 0.000000e+00 -; CHECK-NEXT: [[TMP49:%.*]] = fcmp olt float [[TMP41]], 0.000000e+00 -; CHECK-NEXT: [[TMP50:%.*]] = extractelement <16 x float> [[TMP29]], i32 1 -; CHECK-NEXT: [[TMP51:%.*]] = fcmp ogt 
float [[TMP50]], 0.000000e+00 +; CHECK-NEXT: [[TMP43:%.*]] = extractelement <14 x float> [[TMP28]], i32 0 +; CHECK-NEXT: [[TMP44:%.*]] = fcmp olt float [[TMP43]], 0.000000e+00 +; CHECK-NEXT: [[TMP45:%.*]] = extractelement <14 x float> [[TMP28]], i32 13 +; CHECK-NEXT: [[TMP46:%.*]] = fcmp olt float [[TMP45]], 0.000000e+00 +; CHECK-NEXT: [[TMP47:%.*]] = extractelement <14 x float> [[TMP28]], i32 12 +; CHECK-NEXT: [[TMP48:%.*]] = fcmp olt float [[TMP47]], 0.000000e+00 +; CHECK-NEXT: [[TMP54:%.*]] = fcmp olt float [[TMP43]], 0.000000e+00 +; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <14 x i32> +; CHECK-NEXT: [[TMP61:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <13 x i32> +; CHECK-NEXT: [[TMP62:%.*]] = shufflevector <13 x float> [[TMP27]], <13 x float> [[TMP61]], <4 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = fcmp ogt <4 x float> [[TMP62]], zeroinitializer ; CHECK-NEXT: [[TMP52:%.*]] = fcmp oeq <16 x float> [[TMP40]], zeroinitializer ; CHECK-NEXT: ret <16 x half> zeroinitializer ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll b/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll index b900bd3a8c331..67d43fe31b5f6 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reuse-extracts-in-wider-vect.ll @@ -14,12 +14,12 @@ define i32 @foo(i32 %0, ptr %1, ptr %2, i1 %arg) { ; CHECK-NEXT: br label [[T37:%.*]] ; CHECK: t37: ; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x float> [ [[TMP5]], [[TMP3:%.*]] ], [ [[T89:%.*]], [[T37]] ] -; CHECK-NEXT: [[TMP7:%.*]] = fdiv fast <2 x float> splat (float 1.000000e+00), [[TMP6]] -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP6]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[SHUFFLE:%.*]] = fdiv fast <4 x float> splat (float 1.000000e+00), [[TMP7]] ; 
CHECK-NEXT: [[T21:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[T4]], i64 0, i32 2, i64 0 ; CHECK-NEXT: store <4 x float> [[SHUFFLE]], ptr [[T21]], align 4 ; CHECK-NEXT: [[T89]] = load <2 x float>, ptr [[T9]], align 4 -; CHECK-NEXT: br i1 %arg, label [[T37]], label [[T55:%.*]] +; CHECK-NEXT: br i1 [[ARG:%.*]], label [[T37]], label [[T55:%.*]] ; CHECK: t55: ; CHECK-NEXT: ret i32 0 ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll index f7811aba5ab5f..57b46274f4a13 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll @@ -4,20 +4,13 @@ define <4 x i16> @test() { ; CHECK-LABEL: define <4 x i16> @test() { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> zeroinitializer, <4 x i16> poison, <16 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i16> [[TMP0]], zeroinitializer -; CHECK-NEXT: [[RDX_OP:%.*]] = or <16 x i16> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP36]]) +; CHECK-NEXT: [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer) ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i16> poison, i16 [[TMP37]], i64 0 -; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP39]]) +; CHECK-NEXT: [[TMP40:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer) ; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[TMP40]], i64 1 -; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP43:%.*]] = 
call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP42]]) +; CHECK-NEXT: [[TMP43:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer) ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP41]], i16 [[TMP43]], i64 2 -; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP46:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP45]]) +; CHECK-NEXT: [[TMP46:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> zeroinitializer) ; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP46]], i64 3 ; CHECK-NEXT: [[OP_RDX9:%.*]] = or <4 x i16> [[TMP47]], zeroinitializer ; CHECK-NEXT: ret <4 x i16> [[OP_RDX9]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll b/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll index 9c0f65ec27165..38f49791c570d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/same-values-sub-node-with-poisons.ll @@ -22,17 +22,17 @@ define i32 @test(ptr %f, i1 %tobool.i.4, i32 %retval.0.i.219) { ; CHECK: [[D_EXIT_6]]: ; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ [[TMP1]], %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ [[TMP1]], %[[D_EXIT_4]] ] ; CHECK-NEXT: [[TMP4:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ [[TMP2]], %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[D_EXIT_4]] ] -; CHECK-NEXT: [[TMP8:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ zeroinitializer, %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[D_EXIT_4]] ] +; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ poison, %[[IF_END_I_5]] ], [ zeroinitializer, %[[D_EXIT_3]] ], [ poison, %[[IF_END_I_2]] ], [ zeroinitializer, %[[D_EXIT_4]] ] +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: br label 
%[[D_EXIT_7]] ; CHECK: [[D_EXIT_7]]: ; CHECK-NEXT: [[TMP6:%.*]] = phi <2 x i32> [ [[TMP3]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ] ; CHECK-NEXT: [[TMP7:%.*]] = phi <2 x i32> [ [[TMP4]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ] -; CHECK-NEXT: [[TMP5:%.*]] = phi <2 x i32> [ [[TMP8]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ] -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = phi <4 x i32> [ [[TMP13]], %[[D_EXIT_6]] ], [ poison, %[[IF_END_I_5]] ] ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> , i32 [[TMP0]], i32 0 ; CHECK-NEXT: [[TMP15:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[RETVAL_0_I_219]], i32 3 -; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[TMP13]] +; CHECK-NEXT: [[TMP16:%.*]] = add <4 x i32> [[TMP15]], [[TMP9]] ; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <2 x i32> [[TMP6]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x i32> [[TMP14]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll index 648f051db4a52..c796320069e42 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder.ll @@ -57,11 +57,11 @@ define internal i32 @ipvideo_decode_block_opcode_0xD_16() { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i16> [ undef, [[ENTRY:%.*]] ], [ [[TMP0]], [[IF_END:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = phi <8 x i16> [ undef, [[ENTRY:%.*]] ], [ [[TMP2:%.*]], [[IF_END:%.*]] ] ; CHECK-NEXT: br 
label [[IF_END]] ; CHECK: if.end: ; CHECK-NEXT: store <8 x i16> [[TMP1]], ptr undef, align 2 +; CHECK-NEXT: [[TMP2]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> ; CHECK-NEXT: br label [[FOR_BODY]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll index b4996eb58b47e..798c94e84c99d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll @@ -20,7 +20,6 @@ define void @test() { ; CHECK-NEXT: [[SIN2:%.*]] = call fast double @llvm.sin.f64(double [[A6]]) ; CHECK-NEXT: [[SIN3:%.*]] = call fast double @llvm.sin.f64(double [[A7]]) ; CHECK-NEXT: [[TMP1:%.*]] = load <6 x double>, ptr @src, align 8 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP6]]) ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <6 x double> [[TMP1]], <6 x double> poison, <2 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/X86/whole-registers-compare.ll b/llvm/test/Transforms/SLPVectorizer/X86/whole-registers-compare.ll index 5e52bd7cc954c..27217459fa7b8 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/whole-registers-compare.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/whole-registers-compare.ll @@ -12,9 +12,9 @@ define void @test() { ; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x float> , float [[VAL_SROA_6_1]], i32 1 ; CHECK-NEXT: [[TMP2:%.*]] = fcmp uge <2 x float> [[TMP1]], zeroinitializer ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x i1> [[TMP2]], <2 x i1> poison, <12 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <12 x i1> , <12 x i1> [[TMP3]], <12 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <12 x i1> [[TMP4]], i1 [[CMP119]], i32 11 -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <12 x i1> 
[[TMP5]], <12 x i1> poison, <12 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <12 x i1> , <12 x i1> [[TMP3]], <12 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <12 x i1> [[TMP4]], i1 [[CMP119]], i32 10 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <12 x i1> [[TMP5]], <12 x i1> poison, <12 x i32> ; CHECK-NEXT: [[TMP7]] = select <12 x i1> [[TMP6]], <12 x float> zeroinitializer, <12 x float> zeroinitializer ; CHECK-NEXT: br label %[[DO_BODY]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll index 32e59697486a7..ee98c6ec29a54 100644 --- a/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/extract-many-users-buildvector.ll @@ -7,28 +7,31 @@ define i1 @test(float %0, double %1) { ; X86-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) { ; X86-NEXT: [[TMP3:%.*]] = insertelement <4 x float> , float [[TMP0]], i32 3 ; X86-NEXT: [[TMP4:%.*]] = fpext <4 x float> [[TMP3]] to <4 x double> -; X86-NEXT: [[TMP5:%.*]] = insertelement <6 x double> , double [[TMP1]], i32 4 -; X86-NEXT: [[TMP6:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <6 x i32> -; X86-NEXT: [[TMP7:%.*]] = shufflevector <6 x double> [[TMP5]], <6 x double> [[TMP6]], <6 x i32> -; X86-NEXT: [[TMP8:%.*]] = fmul <6 x double> zeroinitializer, [[TMP7]] -; X86-NEXT: [[TMP9:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> [[TMP8]], <4 x i32> -; X86-NEXT: [[TMP10:%.*]] = shufflevector <4 x double> [[TMP9]], <4 x double> , <4 x i32> -; X86-NEXT: [[TMP11:%.*]] = shufflevector <6 x double> [[TMP7]], <6 x double> poison, <4 x i32> -; X86-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP11]], <4 x double> , <4 x i32> -; X86-NEXT: [[TMP13:%.*]] = fmul <4 x double> [[TMP10]], [[TMP12]] -; X86-NEXT: [[TMP14:%.*]] = shufflevector <4 x double> [[TMP13]], <4 x double> poison, <8 x i32> -; X86-NEXT: [[TMP15:%.*]] = shufflevector <8 x 
double> , <8 x double> [[TMP14]], <8 x i32> -; X86-NEXT: [[TMP16:%.*]] = shufflevector <6 x double> [[TMP8]], <6 x double> poison, <8 x i32> -; X86-NEXT: [[TMP17:%.*]] = shufflevector <8 x double> , <8 x double> [[TMP16]], <8 x i32> -; X86-NEXT: [[TMP18:%.*]] = fsub <8 x double> [[TMP15]], [[TMP17]] -; X86-NEXT: [[TMP19:%.*]] = fmul <8 x double> [[TMP15]], [[TMP17]] -; X86-NEXT: [[TMP20:%.*]] = shufflevector <8 x double> [[TMP18]], <8 x double> [[TMP19]], <8 x i32> -; X86-NEXT: [[TMP21:%.*]] = fptrunc <8 x double> [[TMP20]] to <8 x float> -; X86-NEXT: [[TMP22:%.*]] = fmul <8 x float> [[TMP21]], zeroinitializer -; X86-NEXT: [[TMP23:%.*]] = fcmp oeq <8 x float> [[TMP22]], zeroinitializer -; X86-NEXT: [[TMP24:%.*]] = freeze <8 x i1> [[TMP23]] -; X86-NEXT: [[TMP25:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP24]]) -; X86-NEXT: ret i1 [[TMP25]] +; X86-NEXT: [[TMP5:%.*]] = fmul double 0.000000e+00, 0.000000e+00 +; X86-NEXT: [[TMP6:%.*]] = insertelement <4 x double> , double [[TMP1]], i32 1 +; X86-NEXT: [[TMP7:%.*]] = insertelement <4 x double> [[TMP6]], double [[TMP5]], i32 2 +; X86-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP7]], <4 x double> poison, <4 x i32> +; X86-NEXT: [[TMP9:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> , <4 x i32> +; X86-NEXT: [[TMP10:%.*]] = fmul <4 x double> [[TMP8]], [[TMP9]] +; X86-NEXT: [[TMP11:%.*]] = insertelement <5 x double> poison, double [[TMP1]], i32 4 +; X86-NEXT: [[TMP12:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> poison, <5 x i32> +; X86-NEXT: [[TMP13:%.*]] = shufflevector <5 x double> [[TMP11]], <5 x double> [[TMP12]], <5 x i32> +; X86-NEXT: [[TMP14:%.*]] = fmul <5 x double> zeroinitializer, [[TMP13]] +; X86-NEXT: [[TMP15:%.*]] = shufflevector <4 x double> [[TMP10]], <4 x double> poison, <5 x i32> +; X86-NEXT: [[TMP16:%.*]] = shufflevector <5 x double> , <5 x double> [[TMP15]], <5 x i32> +; X86-NEXT: [[TMP17:%.*]] = fsub <5 x double> [[TMP16]], [[TMP14]] +; X86-NEXT: [[TMP18:%.*]] = 
shufflevector <4 x double> [[TMP7]], <4 x double> poison, <3 x i32> +; X86-NEXT: [[TMP19:%.*]] = shufflevector <3 x double> [[TMP18]], <3 x double> , <3 x i32> +; X86-NEXT: [[TMP20:%.*]] = fmul <3 x double> zeroinitializer, [[TMP19]] +; X86-NEXT: [[TMP21:%.*]] = shufflevector <5 x double> [[TMP17]], <5 x double> poison, <8 x i32> +; X86-NEXT: [[TMP22:%.*]] = shufflevector <3 x double> [[TMP20]], <3 x double> poison, <8 x i32> +; X86-NEXT: [[TMP23:%.*]] = shufflevector <8 x double> [[TMP21]], <8 x double> [[TMP22]], <8 x i32> +; X86-NEXT: [[TMP24:%.*]] = fptrunc <8 x double> [[TMP23]] to <8 x float> +; X86-NEXT: [[TMP25:%.*]] = fmul <8 x float> [[TMP24]], zeroinitializer +; X86-NEXT: [[TMP26:%.*]] = fcmp oeq <8 x float> [[TMP25]], zeroinitializer +; X86-NEXT: [[TMP27:%.*]] = freeze <8 x i1> [[TMP26]] +; X86-NEXT: [[TMP28:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP27]]) +; X86-NEXT: ret i1 [[TMP28]] ; ; AARCH64-LABEL: define i1 @test ; AARCH64-SAME: (float [[TMP0:%.*]], double [[TMP1:%.*]]) { diff --git a/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll b/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll index 766e1fb50330b..580225456b90a 100644 --- a/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll +++ b/llvm/test/Transforms/SLPVectorizer/gathered-consecutive-loads-different-types.ll @@ -17,7 +17,7 @@ define i32 @test(i8 %0) { ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i8> [[TMP5]], <2 x i8> poison, <8 x i32> ; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <8 x i8> , <8 x i8> [[TMP6]], <8 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i8> [[TMP33]], i8 [[TMP2]], i32 5 -; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i8> zeroinitializer, [[TMP7]] +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <8 x i8> [[TMP7]], zeroinitializer ; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i48> , i48 [[TEST_STRUCTCOPY_14_S14_CM_COERCE_SROA_2_0_COPYLOAD]], i32 0 ; CHECK-NEXT: 
[[TMP10:%.*]] = trunc <4 x i48> [[TMP9]] to <4 x i8> ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq <4 x i8> zeroinitializer, [[TMP10]] diff --git a/llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll b/llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll index dbf24bc7721c9..a7fa1607d8f32 100644 --- a/llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll +++ b/llvm/test/Transforms/SLPVectorizer/insertelement-across-zero.ll @@ -14,7 +14,7 @@ define void @test(i8 %0, i8 %1) { ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <8 x i8> [[TMP2]], <8 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <16 x i8> [[TMP5]], <16 x i8> [[TMP7]], <16 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <16 x i8> [[TMP8]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> zeroinitializer, [[TMP9]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp ne <16 x i8> [[TMP9]], zeroinitializer ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll index 304af88b6d134..f05c846027910 100644 --- a/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/reduction-whole-regs-loads.ll @@ -7,13 +7,8 @@ define i64 @test(ptr %p) { ; RISCV-LABEL: @test( ; RISCV-NEXT: entry: -; RISCV-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i64, ptr [[P:%.*]], i64 4 -; RISCV-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[P]], align 4 -; RISCV-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[ARRAYIDX_4]], align 4 -; RISCV-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TMP0]], <4 x i64> poison, <8 x i32> -; RISCV-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> [[TMP2]], <8 x i32> -; RISCV-NEXT: [[TMP7:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <8 x i32> -; RISCV-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TMP3]], <8 x i64> [[TMP7]], <8 x i32> +; RISCV-NEXT: 
[[TMP0:%.*]] = load <6 x i64>, ptr [[P:%.*]], align 4 +; RISCV-NEXT: [[TMP4:%.*]] = shufflevector <6 x i64> [[TMP0]], <6 x i64> poison, <8 x i32> ; RISCV-NEXT: [[TMP5:%.*]] = mul <8 x i64> [[TMP4]], splat (i64 42) ; RISCV-NEXT: [[TMP6:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP5]]) ; RISCV-NEXT: ret i64 [[TMP6]] diff --git a/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll b/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll index 940ee5b95871d..d6a35d21332ea 100644 --- a/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll @@ -39,10 +39,10 @@ define i1 @test(ptr %arg, ptr %i233, i64 %i241, ptr %i235, ptr %i237, ptr %i227) ; AARCH64-NEXT: [[TMP2:%.*]] = insertelement <8 x ptr> , ptr [[I242]], i32 0 ; AARCH64-NEXT: [[TMP3:%.*]] = insertelement <8 x ptr> [[TMP2]], ptr [[I250]], i32 2 ; AARCH64-NEXT: [[TMP4:%.*]] = icmp ult <8 x ptr> [[TMP3]], [[TMP1]] -; AARCH64-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <4 x i32> -; AARCH64-NEXT: [[TMP6:%.*]] = insertelement <4 x ptr> [[TMP5]], ptr [[I245]], i32 2 -; AARCH64-NEXT: [[TMP7:%.*]] = insertelement <4 x ptr> [[TMP6]], ptr [[I248]], i32 3 -; AARCH64-NEXT: [[TMP8:%.*]] = shufflevector <4 x ptr> [[TMP7]], <4 x ptr> poison, <8 x i32> +; AARCH64-NEXT: [[TMP5:%.*]] = shufflevector <8 x ptr> [[TMP3]], <8 x ptr> poison, <8 x i32> +; AARCH64-NEXT: [[TMP6:%.*]] = insertelement <8 x ptr> [[TMP5]], ptr [[I245]], i32 2 +; AARCH64-NEXT: [[TMP7:%.*]] = insertelement <8 x ptr> [[TMP6]], ptr [[I248]], i32 3 +; AARCH64-NEXT: [[TMP8:%.*]] = shufflevector <8 x ptr> [[TMP7]], <8 x ptr> poison, <8 x i32> ; AARCH64-NEXT: [[TMP9:%.*]] = shufflevector <8 x ptr> [[TMP1]], <8 x ptr> , <8 x i32> ; AARCH64-NEXT: [[TMP10:%.*]] = icmp ult <8 x ptr> [[TMP8]], [[TMP9]] ; AARCH64-NEXT: [[TMP11:%.*]] = or <8 x i1> [[TMP4]], [[TMP10]] diff --git a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll 
b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll index b738d25b39be1..7535f277fba4b 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll +++ b/llvm/test/Transforms/SLPVectorizer/revec-shufflevector.ll @@ -7,10 +7,9 @@ define void @test1(ptr %in, ptr %out) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1 ; CHECK-NEXT: [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <16 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> ; CHECK-NEXT: store <8 x i64> [[TMP1]], ptr [[OUT]], align 8 ; CHECK-NEXT: ret void @@ -19,10 +18,9 @@ define void @test1(ptr %in, ptr %out) { ; COMBINE-NEXT: entry: ; COMBINE-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1 ; COMBINE-NEXT: [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0 -; COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> +; COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> ; COMBINE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> ; COMBINE-NEXT: [[TMP5:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64> -; COMBINE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <16 x i32> ; COMBINE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[TMP5]], <8 x i64> poison, <8 x i32> ; COMBINE-NEXT: store <8 x i64> [[TMP1]], ptr [[OUT]], align 8 ; COMBINE-NEXT: ret void @@ -53,10 +51,9 @@ define void @test2(ptr %in, ptr %out) { ; 
CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1 ; CHECK-NEXT: [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <16 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <8 x i32> ; CHECK-NEXT: store <8 x i64> [[TMP2]], ptr [[OUT]], align 8 ; CHECK-NEXT: ret void @@ -65,10 +62,9 @@ define void @test2(ptr %in, ptr %out) { ; COMBINE-NEXT: entry: ; COMBINE-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 1 ; COMBINE-NEXT: [[OUT:%.*]] = getelementptr inbounds i64, ptr [[OUT1:%.*]], i64 0 -; COMBINE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> +; COMBINE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> ; COMBINE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> ; COMBINE-NEXT: [[TMP1:%.*]] = zext <8 x i32> [[TMP4]] to <8 x i64> -; COMBINE-NEXT: [[TMP6:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <16 x i32> ; COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[TMP1]], <8 x i64> poison, <8 x i32> ; COMBINE-NEXT: store <8 x i64> [[TMP2]], ptr [[OUT]], align 8 ; COMBINE-NEXT: ret void @@ -97,7 +93,7 @@ entry: define void @test3(<16 x i32> %0, ptr %out) { ; CHECK-LABEL: @test3( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP0:%.*]], <16 x i32> poison, <64 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP0:%.*]], <16 x i32> poison, <64 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector 
<16 x i32> [[TMP0]], <16 x i32> poison, <16 x i32> ; CHECK-NEXT: [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0 ; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr [[OUT]], align 4 @@ -105,7 +101,7 @@ define void @test3(<16 x i32> %0, ptr %out) { ; ; COMBINE-LABEL: @test3( ; COMBINE-NEXT: entry: -; COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP0:%.*]], <16 x i32> poison, <64 x i32> +; COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP0:%.*]], <16 x i32> poison, <64 x i32> ; COMBINE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <16 x i32> ; COMBINE-NEXT: [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0 ; COMBINE-NEXT: store <16 x i32> [[TMP1]], ptr [[OUT]], align 4 @@ -132,7 +128,7 @@ define void @test4(ptr %in, ptr %out) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 4 ; CHECK-NEXT: [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> poison, <16 x i32> ; CHECK-NEXT: store <16 x i32> [[TMP1]], ptr [[OUT]], align 4 @@ -142,7 +138,7 @@ define void @test4(ptr %in, ptr %out) { ; COMBINE-NEXT: entry: ; COMBINE-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[IN:%.*]], align 4 ; COMBINE-NEXT: [[OUT:%.*]] = getelementptr inbounds i32, ptr [[OUT1:%.*]], i64 0 -; COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> +; COMBINE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> ; COMBINE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <8 x i32> ; COMBINE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP4]], 
<8 x i32> poison, <16 x i32> ; COMBINE-NEXT: store <16 x i32> [[TMP1]], ptr [[OUT]], align 4 @@ -189,36 +185,29 @@ entry: define void @test6(ptr %in0, ptr %in1, ptr %in2) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[IN0:%.*]], i64 32 +; CHECK-NEXT: [[TMP0:%.*]] = load <12 x float>, ptr [[GEP1:%.*]], align 16 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <12 x float> [[TMP0]], <12 x float> poison, <32 x i32> ; CHECK-NEXT: [[LOAD2:%.*]] = load <4 x float>, ptr [[GEP1]], align 16 -; CHECK-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[IN0]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr [[IN1:%.*]], align 1 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <64 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <32 x i32> ; CHECK-NEXT: [[TMP9:%.*]] = zext <32 x i8> [[TMP8]] to <32 x i16> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <32 x i16> [[TMP9]], <32 x i16> poison, <64 x i32> ; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <32 x i16> [[TMP9]], <32 x i16> poison, <32 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = uitofp <32 x i16> [[TMP10]] to <32 x float> -; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x float> [[TMP14]], <16 x float> [[TMP19]], <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x float> [[TMP16]], <16 x float> poison, <32 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = fmul <32 x float> [[TMP3]], [[TMP2]] +; CHECK-NEXT: store <32 x float> [[TMP4]], ptr [[IN2:%.*]], align 16 ; CHECK-NEXT: [[GEP10:%.*]] = getelementptr inbounds i8, ptr [[IN1]], i64 32 -; CHECK-NEXT: [[GEP11:%.*]] = getelementptr inbounds i8, ptr [[IN2:%.*]], i64 128 -; CHECK-NEXT: [[TMP17:%.*]] = load <8 x float>, ptr [[IN0]], align 16 -; CHECK-NEXT: store <32 x 
float> [[TMP4]], ptr [[IN2]], align 16 ; CHECK-NEXT: [[LOAD5:%.*]] = load <16 x i8>, ptr [[GEP10]], align 1 -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <16 x i8> [[LOAD5]], <16 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <32 x float> [[TMP3]], <32 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <32 x float> [[TMP3]], <32 x float> poison, <4 x i32> +; CHECK-NEXT: [[GEP11:%.*]] = getelementptr inbounds i8, ptr [[IN2]], i64 128 +; CHECK-NEXT: [[TMP14:%.*]] = shufflevector <16 x i8> [[LOAD5]], <16 x i8> poison, <32 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[LOAD5]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP15]] to <16 x i16> -; CHECK-NEXT: [[TMP26:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <32 x i32> ; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <16 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = uitofp <16 x i16> [[TMP18]] to <16 x float> -; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <8 x float> [[TMP17]], <8 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> poison, <16 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x float> [[TMP19]], <16 x float> [[TMP21]], <16 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <8 x float> [[TMP17]], <8 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP16]], <16 x i32> ; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <4 x float> [[TMP23]], <4 x float> poison, <16 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x float> [[TMP22]], <16 x float> [[TMP24]], <16 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <16 x i32> @@ -228,36 +217,29 @@ define void @test6(ptr %in0, ptr %in1, ptr %in2) { ; ; COMBINE-LABEL: 
@test6( ; COMBINE-NEXT: entry: -; COMBINE-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[IN0:%.*]], i64 32 +; COMBINE-NEXT: [[TMP0:%.*]] = load <12 x float>, ptr [[GEP1:%.*]], align 16 +; COMBINE-NEXT: [[TMP6:%.*]] = shufflevector <12 x float> [[TMP0]], <12 x float> poison, <32 x i32> ; COMBINE-NEXT: [[LOAD2:%.*]] = load <4 x float>, ptr [[GEP1]], align 16 -; COMBINE-NEXT: [[TMP0:%.*]] = load <8 x float>, ptr [[IN0]], align 16 ; COMBINE-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr [[IN1:%.*]], align 1 -; COMBINE-NEXT: [[TMP10:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <64 x i32> ; COMBINE-NEXT: [[TMP11:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> poison, <32 x i32> ; COMBINE-NEXT: [[TMP4:%.*]] = zext <32 x i8> [[TMP11]] to <32 x i16> -; COMBINE-NEXT: [[TMP12:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <64 x i32> ; COMBINE-NEXT: [[TMP19:%.*]] = shufflevector <32 x i16> [[TMP4]], <32 x i16> poison, <32 x i32> ; COMBINE-NEXT: [[TMP2:%.*]] = uitofp <32 x i16> [[TMP19]] to <32 x float> -; COMBINE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[TMP0]], <8 x float> poison, <16 x i32> -; COMBINE-NEXT: [[TMP13:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> -; COMBINE-NEXT: [[TMP5:%.*]] = shufflevector <16 x float> [[TMP3]], <16 x float> [[TMP13]], <16 x i32> -; COMBINE-NEXT: [[TMP6:%.*]] = shufflevector <16 x float> [[TMP5]], <16 x float> poison, <32 x i32> ; COMBINE-NEXT: [[TMP7:%.*]] = fmul <32 x float> [[TMP6]], [[TMP2]] +; COMBINE-NEXT: store <32 x float> [[TMP7]], ptr [[IN2:%.*]], align 16 ; COMBINE-NEXT: [[GEP10:%.*]] = getelementptr inbounds i8, ptr [[IN1]], i64 32 -; COMBINE-NEXT: [[GEP11:%.*]] = getelementptr inbounds i8, ptr [[IN2:%.*]], i64 128 -; COMBINE-NEXT: [[TMP8:%.*]] = load <8 x float>, ptr [[IN0]], align 16 -; COMBINE-NEXT: store <32 x float> [[TMP7]], ptr [[IN2]], align 16 ; COMBINE-NEXT: [[LOAD5:%.*]] = load <16 x i8>, ptr [[GEP10]], align 1 -; COMBINE-NEXT: [[TMP14:%.*]] = 
shufflevector <16 x i8> [[LOAD5]], <16 x i8> poison, <32 x i32> +; COMBINE-NEXT: [[TMP20:%.*]] = shufflevector <32 x float> [[TMP6]], <32 x float> poison, <4 x i32> +; COMBINE-NEXT: [[TMP23:%.*]] = shufflevector <32 x float> [[TMP6]], <32 x float> poison, <4 x i32> +; COMBINE-NEXT: [[GEP11:%.*]] = getelementptr inbounds i8, ptr [[IN2]], i64 128 +; COMBINE-NEXT: [[TMP10:%.*]] = shufflevector <16 x i8> [[LOAD5]], <16 x i8> poison, <32 x i32> ; COMBINE-NEXT: [[TMP24:%.*]] = shufflevector <16 x i8> [[LOAD5]], <16 x i8> poison, <16 x i32> ; COMBINE-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i16> -; COMBINE-NEXT: [[TMP26:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <32 x i32> ; COMBINE-NEXT: [[TMP18:%.*]] = shufflevector <16 x i16> [[TMP25]], <16 x i16> poison, <16 x i32> ; COMBINE-NEXT: [[TMP9:%.*]] = uitofp <16 x i16> [[TMP18]] to <16 x float> -; COMBINE-NEXT: [[TMP20:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <4 x i32> ; COMBINE-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> poison, <16 x i32> -; COMBINE-NEXT: [[TMP22:%.*]] = shufflevector <16 x float> [[TMP13]], <16 x float> [[TMP21]], <16 x i32> -; COMBINE-NEXT: [[TMP23:%.*]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <4 x i32> +; COMBINE-NEXT: [[TMP26:%.*]] = shufflevector <4 x float> [[LOAD2]], <4 x float> poison, <16 x i32> +; COMBINE-NEXT: [[TMP22:%.*]] = shufflevector <16 x float> [[TMP21]], <16 x float> [[TMP26]], <16 x i32> ; COMBINE-NEXT: [[TMP27:%.*]] = shufflevector <4 x float> [[TMP23]], <4 x float> poison, <16 x i32> ; COMBINE-NEXT: [[TMP15:%.*]] = shufflevector <16 x float> [[TMP22]], <16 x float> [[TMP27]], <16 x i32> ; COMBINE-NEXT: [[TMP16:%.*]] = shufflevector <16 x float> [[TMP15]], <16 x float> poison, <16 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/revec.ll b/llvm/test/Transforms/SLPVectorizer/revec.ll index df42cba7c8d45..4579e0f98c37e 100644 --- a/llvm/test/Transforms/SLPVectorizer/revec.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/revec.ll @@ -224,8 +224,8 @@ define void @test8() { ; CHECK-NEXT: [[TMP0:%.*]] = phi <8 x float> [ zeroinitializer, [[ENTRY:%.*]] ], [ [[TMP8:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: ret void ; CHECK: for.body: -; CHECK-NEXT: [[TMP7:%.*]] = phi <4 x float> [ [[TMP7]], [[FOR_BODY]] ], [ zeroinitializer, [[ENTRY]] ] -; CHECK-NEXT: [[TMP8]] = shufflevector <4 x float> [[TMP7]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP8]] = phi <8 x float> [ [[TMP2:%.*]], [[FOR_BODY]] ], [ zeroinitializer, [[ENTRY]] ] +; CHECK-NEXT: [[TMP2]] = shufflevector <8 x float> [[TMP8]], <8 x float> poison, <8 x i32> ; CHECK-NEXT: br i1 false, label [[FOR0]], label [[FOR_BODY]] ; entry: @@ -268,10 +268,9 @@ define void @test10() { ; CHECK-LABEL: @test10( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr null, align 1 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> poison, <32 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> poison, <32 x i32> ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[TMP0]], <16 x i8> poison, <16 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = sext <16 x i8> [[TMP4]] to <16 x i16> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x i16> [[TMP5]], <16 x i16> poison, <32 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP5]], <16 x i16> poison, <16 x i32> ; CHECK-NEXT: [[TMP8:%.*]] = trunc <16 x i16> [[TMP7]] to <16 x i8> ; CHECK-NEXT: [[TMP9:%.*]] = sext <16 x i8> [[TMP8]] to <16 x i32> @@ -378,7 +377,7 @@ entry: define void @test13(<8 x i32> %0, ptr %out0, ptr %out1, ptr %out2) { ; CHECK-LABEL: @test13( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0:%.*]], <8 x i32> poison, <32 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0:%.*]], <8 x i32> poison, <32 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <16 x i32> ; CHECK-NEXT: br label 
[[FOR_END_LOOPEXIT:%.*]] ; CHECK: for.end.loopexit: @@ -414,10 +413,9 @@ for.end.loopexit: define void @test14(<8 x i1> %0) { ; CHECK-LABEL: @test14( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i1> [[TMP0:%.*]], <8 x i1> poison, <16 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = sext <16 x i1> [[TMP2]] to <16 x i16> -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> poison, <32 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i1> [[TMP0:%.*]], <8 x i1> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = sext <16 x i1> [[TMP1]] to <16 x i16> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> poison, <16 x i32> ; CHECK-NEXT: br label [[FOR_END_LOOPEXIT:%.*]] ; CHECK: for.end.loopexit: ; CHECK-NEXT: [[TMP6:%.*]] = phi <16 x i16> [ [[TMP5]], [[ENTRY:%.*]] ] diff --git a/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll b/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll index cf5927bf58327..4b110143ec1e9 100644 --- a/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll +++ b/llvm/test/Transforms/SLPVectorizer/shuffle-mask-resized.ll @@ -1,22 +1,37 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -slp-threshold=-10 < %s | FileCheck %s %} -; RUN: %if aaarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux -slp-threshold=-10 < %s | FileCheck %s %} +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux -slp-threshold=-10 < %s | FileCheck %s --check-prefix=X86 %} +; RUN: %if aaarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux -slp-threshold=-10 < %s | FileCheck %s --check-prefix=AARCH64 %} define i32 @test() { -; CHECK-LABEL: 
define i32 @test() { -; CHECK-NEXT: bb: -; CHECK-NEXT: br label [[BB1:%.*]] -; CHECK: bb1: -; CHECK-NEXT: [[TMP0:%.*]] = phi <2 x i32> [ [[TMP5:%.*]], [[BB3:%.*]] ], [ zeroinitializer, [[BB:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: br i1 false, label [[BB4:%.*]], label [[BB3]] -; CHECK: bb3: -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> , <2 x i32> -; CHECK-NEXT: [[TMP5]] = add <2 x i32> zeroinitializer, [[TMP2]] -; CHECK-NEXT: br label [[BB1]] -; CHECK: bb4: -; CHECK-NEXT: [[TMP6:%.*]] = phi <8 x i32> [ [[TMP1]], [[BB1]] ] -; CHECK-NEXT: ret i32 0 +; X86-LABEL: define i32 @test() { +; X86-NEXT: bb: +; X86-NEXT: br label [[BB1:%.*]] +; X86: bb1: +; X86-NEXT: [[TMP0:%.*]] = phi <8 x i32> [ [[TMP4:%.*]], [[BB3:%.*]] ], [ zeroinitializer, [[BB:%.*]] ] +; X86-NEXT: br i1 false, label [[BB4:%.*]], label [[BB3]] +; X86: bb3: +; X86-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <2 x i32> +; X86-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> , <2 x i32> +; X86-NEXT: [[TMP3:%.*]] = add <2 x i32> zeroinitializer, [[TMP2]] +; X86-NEXT: [[TMP4]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> poison, <8 x i32> +; X86-NEXT: br label [[BB1]] +; X86: bb4: +; X86-NEXT: [[TMP5:%.*]] = phi <8 x i32> [ [[TMP0]], [[BB1]] ] +; X86-NEXT: ret i32 0 +; +; AARCH64-LABEL: define i32 @test() { +; AARCH64-NEXT: bb: +; AARCH64-NEXT: br label [[BB1:%.*]] +; AARCH64: bb1: +; AARCH64-NEXT: [[TMP0:%.*]] = phi <8 x i32> [ [[TMP2:%.*]], [[BB3:%.*]] ], [ zeroinitializer, [[BB:%.*]] ] +; AARCH64-NEXT: br i1 false, label [[BB4:%.*]], label [[BB3]] +; AARCH64: bb3: +; AARCH64-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> , <8 x i32> +; AARCH64-NEXT: [[TMP2]] = add <8 x i32> zeroinitializer, [[TMP1]] +; AARCH64-NEXT: br label [[BB1]] +; AARCH64: bb4: +; AARCH64-NEXT: [[TMP3:%.*]] = phi <8 x i32> [ [[TMP0]], [[BB1]] ] +; AARCH64-NEXT: ret i32 0 
; bb: br label %bb1