diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index b2124c6106198..feb0af61b15c2 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -800,6 +800,12 @@ class TargetTransformInfo { /// Return true if the target supports strided load. bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const; + /// Return true if the target supports interleaved access for the given vector + /// type \p VTy, interleave factor \p Factor, alignment \p Alignment and + /// address space \p AddrSpace. + bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, + Align Alignment, unsigned AddrSpace) const; + // Return true if the target supports masked vector histograms. bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const; @@ -1906,6 +1912,10 @@ class TargetTransformInfo::Concept { virtual bool isLegalMaskedCompressStore(Type *DataType, Align Alignment) = 0; virtual bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) = 0; virtual bool isLegalStridedLoadStore(Type *DataType, Align Alignment) = 0; + virtual bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, + Align Alignment, + unsigned AddrSpace) = 0; + virtual bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) = 0; virtual bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, @@ -2417,6 +2427,12 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { bool isLegalStridedLoadStore(Type *DataType, Align Alignment) override { return Impl.isLegalStridedLoadStore(DataType, Alignment); } + bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, + Align Alignment, + unsigned AddrSpace) override { + return Impl.isLegalInterleavedAccessType(VTy, Factor, Alignment, AddrSpace); + } + bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) override { return 
Impl.isLegalMaskedVectorHistogram(AddrType, DataType); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 11b07ac0b7fc4..2ed9a1d583d89 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -321,6 +321,11 @@ class TargetTransformInfoImplBase { return false; } + bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, + Align Alignment, unsigned AddrSpace) { + return false; + } + bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const { return false; } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 2c26493bd3f1c..be8bf762e12e1 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -517,6 +517,13 @@ bool TargetTransformInfo::isLegalStridedLoadStore(Type *DataType, return TTIImpl->isLegalStridedLoadStore(DataType, Alignment); } +bool TargetTransformInfo::isLegalInterleavedAccessType( + VectorType *VTy, unsigned Factor, Align Alignment, + unsigned AddrSpace) const { + return TTIImpl->isLegalInterleavedAccessType(VTy, Factor, Alignment, + AddrSpace); +} + bool TargetTransformInfo::isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const { return TTIImpl->isLegalMaskedVectorHistogram(AddrType, DataType); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index cc69e1d118b5a..1ce80fe8ed7e1 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -281,6 +281,12 @@ class RISCVTTIImpl : public BasicTTIImplBase { return TLI->isLegalStridedLoadStore(DataTypeVT, Alignment); } + bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, + Align Alignment, unsigned AddrSpace) { + return TLI->isLegalInterleavedAccessType(VTy, Factor, Alignment, AddrSpace, + 
DL); + } + bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment); bool isVScaleKnownToBeAPowerOfTwo() const { diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 949579772b94d..c07a39fe0dbc6 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -1336,6 +1336,8 @@ class BoUpSLP { MustGather.clear(); NonScheduledFirst.clear(); EntryToLastInstruction.clear(); + LoadEntriesToVectorize.clear(); + GatheredLoadsEntriesFirst = NoGatheredLoads; ExternalUses.clear(); ExternalUsesAsOriginalScalar.clear(); for (auto &Iter : BlocksSchedules) { @@ -1352,7 +1354,11 @@ class BoUpSLP { ValueToGatherNodes.clear(); } - unsigned getTreeSize() const { return VectorizableTree.size(); } + unsigned getTreeSize() const { + return GatheredLoadsEntriesFirst == NoGatheredLoads + ? VectorizableTree.size() + : GatheredLoadsEntriesFirst; + } /// Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); @@ -1460,11 +1466,14 @@ class BoUpSLP { /// \param VL0 main load value. /// \param Order returned order of load instructions. /// \param PointerOps returned list of pointer operands. + /// \param BestVF return best vector factor, if recursive check found better + /// vectorization sequences rather than masked gather. /// \param TryRecursiveCheck used to check if long masked gather can be /// represented as a serie of loads/insert subvector, if profitable. LoadsState canVectorizeLoads(ArrayRef VL, const Value *VL0, SmallVectorImpl &Order, SmallVectorImpl &PointerOps, + unsigned *BestVF = nullptr, bool TryRecursiveCheck = true) const; OptimizationRemarkEmitter *getORE() { return ORE; } @@ -2827,7 +2836,7 @@ class BoUpSLP { /// This is the recursive part of buildTree. 
void buildTree_rec(ArrayRef Roots, unsigned Depth, - const EdgeInfo &EI); + const EdgeInfo &EI, unsigned InterleaveFactor = 0); /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a @@ -2939,6 +2948,12 @@ class BoUpSLP { /// be beneficial even the tree height is tiny. bool isFullyVectorizableTinyTree(bool ForReduction) const; + /// Run through the list of all gathered loads in the graph and try to find + /// vector loads/masked gathers instead of regular gathers. Later these loads + /// are reshuffled to build final gathered nodes. + void tryToVectorizeGatheredLoads( + ArrayRef>> GatheredLoads); + /// Reorder commutative or alt operands to get better probability of /// generating vectorized code. static void reorderInputsAccordingToOpcode(ArrayRef VL, @@ -3011,7 +3026,8 @@ class BoUpSLP { } bool isOperandGatherNode(const EdgeInfo &UserEI) const { - return isGather() && UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx && + return isGather() && !UserTreeIndices.empty() && + UserTreeIndices.front().EdgeIdx == UserEI.EdgeIdx && UserTreeIndices.front().UserTE == UserEI.UserTE; } @@ -3115,7 +3131,19 @@ class BoUpSLP { Instruction *MainOp = nullptr; Instruction *AltOp = nullptr; + /// Interleaving factor for interleaved loads Vectorize nodes. + unsigned InterleaveFactor = 0; + public: + /// Returns interleave factor for interleave nodes. + std::optional getInterleaveFactor() const { + if (InterleaveFactor > 0) + return InterleaveFactor; + return std::nullopt; + } + /// Sets interleaving factor for the interleaving nodes. + void setInterleave(unsigned Factor) { InterleaveFactor = Factor; } + /// Set this bundle's \p OpIdx'th operand to \p OpVL. 
void setOperand(unsigned OpIdx, ArrayRef OpVL) { if (Operands.size() < OpIdx + 1) @@ -3260,7 +3288,12 @@ class BoUpSLP { dbgs() << "State: "; switch (State) { case Vectorize: - dbgs() << "Vectorize\n"; + if (InterleaveFactor > 0) { + dbgs() << "Vectorize with interleave factor " << InterleaveFactor + << "\n"; + } else { + dbgs() << "Vectorize\n"; + } break; case ScatterVectorize: dbgs() << "ScatterVectorize\n"; @@ -3330,11 +3363,15 @@ class BoUpSLP { const InstructionsState &S, const EdgeInfo &UserTreeIdx, ArrayRef ReuseShuffleIndices = std::nullopt, - ArrayRef ReorderIndices = std::nullopt) { + ArrayRef ReorderIndices = std::nullopt, + unsigned InterleaveFactor = 0) { TreeEntry::EntryState EntryState = Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather; - return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, - ReuseShuffleIndices, ReorderIndices); + TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, + ReuseShuffleIndices, ReorderIndices); + if (E && InterleaveFactor) + E->setInterleave(InterleaveFactor); + return E; } TreeEntry *newTreeEntry(ArrayRef VL, @@ -3347,6 +3384,12 @@ class BoUpSLP { assert(((!Bundle && EntryState == TreeEntry::NeedToGather) || (Bundle && EntryState != TreeEntry::NeedToGather)) && "Need to vectorize gather entry?"); + // Gathered loads still gathered? Do not create entry, use the original one. + if (GatheredLoadsEntriesFirst != NoGatheredLoads && + EntryState == TreeEntry::NeedToGather && + S.getOpcode() == Instruction::Load && UserTreeIdx.EdgeIdx == UINT_MAX && + !UserTreeIdx.UserTE) + return nullptr; VectorizableTree.push_back(std::make_unique(VectorizableTree)); TreeEntry *Last = VectorizableTree.back().get(); Last->Idx = VectorizableTree.size() - 1; @@ -3456,7 +3499,7 @@ class BoUpSLP { /// and fills required data before actual scheduling of the instructions. 
TreeEntry::EntryState getScalarsVectorizationState( InstructionsState &S, ArrayRef VL, bool IsScatterVectorizeUserTE, - OrdersType &CurrentOrder, SmallVectorImpl &PointerOps) const; + OrdersType &CurrentOrder, SmallVectorImpl &PointerOps); /// Maps a specific scalar to its tree entry. SmallDenseMap ScalarToTreeEntry; @@ -3491,6 +3534,14 @@ class BoUpSLP { DenseMap>; ValueToGatherNodesMap ValueToGatherNodes; + /// A list of the loads, which can be vectorized using strided or masked + /// gather approach, but attempted to be represented as interleaved loads. + SetVector LoadEntriesToVectorize; + + /// The index of the first gathered load entry in the VectorizeTree. + constexpr static int NoGatheredLoads = -1; + int GatheredLoadsEntriesFirst = NoGatheredLoads; + /// This POD struct describes one external user in the vectorized tree. struct ExternalUser { ExternalUser(Value *S, llvm::User *U, int L) @@ -4662,15 +4713,19 @@ getShuffleCost(const TargetTransformInfo &TTI, TTI::ShuffleKind Kind, return TTI.getShuffleCost(Kind, Tp, Mask, CostKind, Index, SubTp, Args); } -BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( - ArrayRef VL, const Value *VL0, SmallVectorImpl &Order, - SmallVectorImpl &PointerOps, bool TryRecursiveCheck) const { +BoUpSLP::LoadsState +BoUpSLP::canVectorizeLoads(ArrayRef VL, const Value *VL0, + SmallVectorImpl &Order, + SmallVectorImpl &PointerOps, + unsigned *BestVF, bool TryRecursiveCheck) const { // Check that a vectorized load would load the same memory as a scalar // load. For example, we don't want to vectorize loads that are smaller // than 8-bit. Even though we have a packed struct {} LLVM // treats loading/storing it as an i8 struct. If we vectorize loads/stores // from such a struct, we read/write packed bits disagreeing with the // unvectorized version. 
+ if (BestVF) + *BestVF = 0; Type *ScalarTy = VL0->getType(); if (DL->getTypeSizeInBits(ScalarTy) != DL->getTypeAllocSizeInBits(ScalarTy)) @@ -4780,25 +4835,91 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( } } } - auto CheckForShuffledLoads = [&, &TTI = *TTI](Align CommonAlignment) { + // Correctly identify compare the cost of loads + shuffles rather than + // strided/masked gather loads. Returns true if vectorized + shuffles + // representation is better than just gather. + auto CheckForShuffledLoads = [&, &TTI = *TTI]( + Align CommonAlignment, unsigned *BestVF, + bool ProfitableGatherPointers) { + // Compare masked gather cost and loads + insert subvector costs. + if (BestVF) + *BestVF = 0; + TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + auto [ScalarGEPCost, VectorGEPCost] = + getGEPCosts(TTI, PointerOps, PointerOps.front(), + Instruction::GetElementPtr, CostKind, ScalarTy, VecTy); + // Estimate the cost of masked gather GEP. If not a splat, roughly + // estimate as a buildvector, otherwise estimate as splat. + if (static_cast( + count_if(PointerOps, IsaPred)) < + PointerOps.size() - 1 || + any_of(PointerOps, [&](Value *V) { + return getUnderlyingObject(V) != + getUnderlyingObject(PointerOps.front()); + })) + VectorGEPCost += TTI.getScalarizationOverhead( + VecTy, + APInt::getAllOnes(VecTy->getElementCount().getKnownMinValue()), + /*Insert=*/true, /*Extract=*/false, CostKind); + else + VectorGEPCost += + TTI.getScalarizationOverhead( + VecTy, + APInt::getOneBitSet(VecTy->getElementCount().getKnownMinValue(), + 0), + /*Insert=*/true, /*Extract=*/false, CostKind) + + ::getShuffleCost(TTI, TTI::SK_Broadcast, VecTy, std::nullopt, + CostKind); + // The cost of scalar loads. + InstructionCost ScalarLoadsCost = + std::accumulate(VL.begin(), VL.end(), InstructionCost(), + [&](InstructionCost C, Value *V) { + return C + TTI.getInstructionCost( + cast(V), CostKind); + }) + + ScalarGEPCost; + // The cost of masked gather. 
+ InstructionCost MaskedGatherCost = + TTI.getGatherScatterOpCost(Instruction::Load, VecTy, + cast(VL0)->getPointerOperand(), + /*VariableMask=*/false, CommonAlignment, + CostKind) + + (ProfitableGatherPointers ? 0 : VectorGEPCost); + APInt DemandedElts = APInt::getAllOnes(VecTy->getNumElements()); + InstructionCost GatherCost = + TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true, + /*Extract=*/false, CostKind) + + ScalarLoadsCost; + // The list of loads is small or perform partial check already - directly + // compare masked gather cost and gather cost. + constexpr unsigned ListLimit = 4; + if (!TryRecursiveCheck || VL.size() < ListLimit) + return MaskedGatherCost - GatherCost >= -SLPCostThreshold; unsigned Sz = DL->getTypeSizeInBits(ScalarTy); - unsigned MinVF = getMinVF(Sz); - unsigned MaxVF = std::max(bit_floor(VL.size() / 2), MinVF); + unsigned MinVF = 2; + unsigned MaxVF = bit_floor(VL.size() / 2); MaxVF = std::min(getMaximumVF(Sz, Instruction::Load), MaxVF); + DemandedElts.clearAllBits(); + // Iterate through possible vectorization factors and check if vectorized + // + shuffles is better than just gather. for (unsigned VF = MaxVF; VF >= MinVF; VF /= 2) { - unsigned VectorizedCnt = 0; SmallVector States; - for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; - Cnt += VF, ++VectorizedCnt) { + for (unsigned Cnt = 0, End = VL.size(); Cnt + VF <= End; Cnt += VF) { ArrayRef Slice = VL.slice(Cnt, VF); SmallVector Order; SmallVector PointerOps; LoadsState LS = - canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, + canVectorizeLoads(Slice, Slice.front(), Order, PointerOps, BestVF, /*TryRecursiveCheck=*/false); // Check that the sorted loads are consecutive. - if (LS == LoadsState::Gather) - break; + if (LS == LoadsState::Gather) { + if (BestVF) { + DemandedElts.setAllBits(); + break; + } + DemandedElts.setBits(Cnt, Cnt + VF); + continue; + } // If need the reorder - consider as high-cost masked gather for now. 
if ((LS == LoadsState::Vectorize || LS == LoadsState::StridedVectorize) && @@ -4806,79 +4927,97 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( LS = LoadsState::ScatterVectorize; States.push_back(LS); } + if (DemandedElts.isAllOnes()) + // All loads gathered - try smaller VF. + continue; + InstructionCost ScalarVFGEPCost = 0; // Can be vectorized later as a serie of loads/insertelements. - if (VectorizedCnt == VL.size() / VF) { - // Compare masked gather cost and loads + insersubvector costs. - TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; - auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts( - TTI, PointerOps, PointerOps.front(), Instruction::GetElementPtr, - CostKind, ScalarTy, VecTy); - InstructionCost MaskedGatherCost = - TTI.getGatherScatterOpCost( - Instruction::Load, VecTy, - cast(VL0)->getPointerOperand(), - /*VariableMask=*/false, CommonAlignment, CostKind) + - VectorGEPCost - ScalarGEPCost; - InstructionCost VecLdCost = 0; - auto *SubVecTy = getWidenedType(ScalarTy, VF); - for (auto [I, LS] : enumerate(States)) { - auto *LI0 = cast(VL[I * VF]); - switch (LS) { - case LoadsState::Vectorize: { - auto [ScalarGEPCost, VectorGEPCost] = - getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF), - LI0->getPointerOperand(), Instruction::Load, - CostKind, ScalarTy, SubVecTy); - VecLdCost += TTI.getMemoryOpCost( - Instruction::Load, SubVecTy, LI0->getAlign(), - LI0->getPointerAddressSpace(), CostKind, - TTI::OperandValueInfo()) + - VectorGEPCost - ScalarGEPCost; - break; - } - case LoadsState::StridedVectorize: { - auto [ScalarGEPCost, VectorGEPCost] = - getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF), - LI0->getPointerOperand(), Instruction::Load, - CostKind, ScalarTy, SubVecTy); + InstructionCost VecLdCost = 0; + if (!DemandedElts.isZero()) { + VecLdCost = + TTI.getScalarizationOverhead(VecTy, DemandedElts, /*Insert=*/true, + /*Extract=*/false, CostKind) + + ScalarGEPCost; + for (unsigned Idx : seq(VL.size())) + if (DemandedElts[Idx]) 
VecLdCost += - TTI.getStridedMemoryOpCost( - Instruction::Load, SubVecTy, LI0->getPointerOperand(), - /*VariableMask=*/false, CommonAlignment, CostKind) + - VectorGEPCost - ScalarGEPCost; - break; - } - case LoadsState::ScatterVectorize: { - auto [ScalarGEPCost, VectorGEPCost] = getGEPCosts( - TTI, ArrayRef(PointerOps).slice(I * VF, VF), - LI0->getPointerOperand(), Instruction::GetElementPtr, - CostKind, ScalarTy, SubVecTy); - VecLdCost += - TTI.getGatherScatterOpCost( - Instruction::Load, SubVecTy, LI0->getPointerOperand(), - /*VariableMask=*/false, CommonAlignment, CostKind) + - VectorGEPCost - ScalarGEPCost; - break; - } - case LoadsState::Gather: - llvm_unreachable( - "Expected only consecutive, strided or masked gather loads."); - } - SmallVector ShuffleMask(VL.size()); - for (int Idx : seq(0, VL.size())) - ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx; + TTI.getInstructionCost(cast(VL[Idx]), CostKind); + } + auto *SubVecTy = getWidenedType(ScalarTy, VF); + for (auto [I, LS] : enumerate(States)) { + auto *LI0 = cast(VL[I * VF]); + InstructionCost VectorGEPCost = + (LS == LoadsState::ScatterVectorize && ProfitableGatherPointers) + ? 
0 + : getGEPCosts(TTI, ArrayRef(PointerOps).slice(I * VF, VF), + LI0->getPointerOperand(), + Instruction::GetElementPtr, CostKind, ScalarTy, + SubVecTy) + .second; + if (LS == LoadsState::ScatterVectorize) { + if (static_cast( + count_if(PointerOps, IsaPred)) < + PointerOps.size() - 1 || + any_of(PointerOps, [&](Value *V) { + return getUnderlyingObject(V) != + getUnderlyingObject(PointerOps.front()); + })) + VectorGEPCost += TTI.getScalarizationOverhead( + SubVecTy, APInt::getAllOnes(VF), + /*Insert=*/true, /*Extract=*/false, CostKind); + else + VectorGEPCost += + TTI.getScalarizationOverhead( + SubVecTy, APInt::getOneBitSet(VF, 0), + /*Insert=*/true, /*Extract=*/false, CostKind) + + ::getShuffleCost(TTI, TTI::SK_Broadcast, SubVecTy, + std::nullopt, CostKind); + } + switch (LS) { + case LoadsState::Vectorize: + VecLdCost += TTI.getMemoryOpCost( + Instruction::Load, SubVecTy, LI0->getAlign(), + LI0->getPointerAddressSpace(), CostKind, + TTI::OperandValueInfo()) + + VectorGEPCost; + break; + case LoadsState::StridedVectorize: + VecLdCost += TTI.getStridedMemoryOpCost(Instruction::Load, SubVecTy, + LI0->getPointerOperand(), + /*VariableMask=*/false, + CommonAlignment, CostKind) + + VectorGEPCost; + break; + case LoadsState::ScatterVectorize: + VecLdCost += TTI.getGatherScatterOpCost(Instruction::Load, SubVecTy, + LI0->getPointerOperand(), + /*VariableMask=*/false, + CommonAlignment, CostKind) + + VectorGEPCost; + break; + case LoadsState::Gather: + // Gathers are already calculated - ignore. + continue; + } + SmallVector ShuffleMask(VL.size()); + for (int Idx : seq(0, VL.size())) + ShuffleMask[Idx] = Idx / VF == I ? VL.size() + Idx % VF : Idx; + if (I > 0) VecLdCost += ::getShuffleCost(TTI, TTI::SK_InsertSubvector, VecTy, ShuffleMask, CostKind, I * VF, SubVecTy); - } - // If masked gather cost is higher - better to vectorize, so - // consider it as a gather node. It will be better estimated - // later. 
- if (MaskedGatherCost >= VecLdCost) - return true; + } + // If masked gather cost is higher - better to vectorize, so + // consider it as a gather node. It will be better estimated + // later. + if (MaskedGatherCost >= VecLdCost && + VecLdCost - GatherCost < -SLPCostThreshold) { + if (BestVF) + *BestVF = VF; + return true; } } - return false; + return MaskedGatherCost - GatherCost >= -SLPCostThreshold; }; // TODO: need to improve analysis of the pointers, if not all of them are // GEPs or have > 2 operands, we end up with a gather node, which just @@ -4900,7 +5039,8 @@ BoUpSLP::LoadsState BoUpSLP::canVectorizeLoads( !TTI->forceScalarizeMaskedGather(VecTy, CommonAlignment)) { // Check if potential masked gather can be represented as series // of loads + insertsubvectors. - if (TryRecursiveCheck && CheckForShuffledLoads(CommonAlignment)) { + if (CheckForShuffledLoads(CommonAlignment, BestVF, + ProfitableGatherPointers)) { // If masked gather cost is higher - better to vectorize, so // consider it as a gather node. It will be better estimated // later. @@ -5327,6 +5467,16 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { if (TE.Scalars.size() >= 4) if (std::optional Order = findPartiallyOrderedLoads(TE)) return Order; + // Check if can include the order of vectorized loads. For masked gathers do + // extra analysis later, so include such nodes into a special list. 
+ if (TE.isGather() && TE.getOpcode() == Instruction::Load) { + SmallVector PointerOps; + OrdersType CurrentOrder; + LoadsState Res = canVectorizeLoads(TE.Scalars, TE.Scalars.front(), + CurrentOrder, PointerOps); + if (Res == LoadsState::Vectorize || Res == LoadsState::StridedVectorize) + return std::move(CurrentOrder); + } if (std::optional CurrentOrder = findReusedOrderedScalars(TE)) return CurrentOrder; } @@ -6245,6 +6395,513 @@ void BoUpSLP::buildTree(ArrayRef Roots) { buildTree_rec(Roots, 0, EdgeInfo()); } +/// Tries to find subvector of loads and builds new vector of only loads if can +/// be profitable. +static void gatherPossiblyVectorizableLoads( + const BoUpSLP &R, ArrayRef VL, const DataLayout &DL, + ScalarEvolution &SE, const TargetTransformInfo &TTI, + SmallVectorImpl>> &GatheredLoads, + bool AddNew = true) { + if (VL.empty()) + return; + if (!isValidElementType(VL.front()->getType())) + return; + Type *ScalarTy = VL.front()->getType(); + int NumScalars = VL.size(); + auto *VecTy = getWidenedType(ScalarTy, NumScalars); + int NumParts = TTI.getNumberOfParts(VecTy); + if (NumParts == 0 || NumParts >= NumScalars) + NumParts = 1; + unsigned VF = PowerOf2Ceil(NumScalars / NumParts); + SmallVector>> ClusteredLoads; + for (int I : seq(0, NumParts)) { + for (Value *V : + VL.slice(I * VF, std::min(VF, VL.size() - I * VF))) { + auto *LI = dyn_cast(V); + if (!LI) + continue; + if (R.isDeleted(LI) || R.isVectorized(LI) || !LI->isSimple()) + continue; + bool IsFound = false; + for (auto &Data : ClusteredLoads) { + if (LI->getParent() != Data.front().first->getParent()) + continue; + std::optional Dist = + getPointersDiff(LI->getType(), LI->getPointerOperand(), + Data.front().first->getType(), + Data.front().first->getPointerOperand(), DL, SE, + /*StrictCheck=*/true); + if (Dist && all_of(Data, [&](const std::pair &Pair) { + IsFound |= Pair.first == LI; + return IsFound || Pair.second != *Dist; + })) { + if (!IsFound) + Data.emplace_back(LI, *Dist); + IsFound = 
true; + break; + } + } + if (!IsFound) + ClusteredLoads.emplace_back().emplace_back(LI, 0); + } + } + auto FindMatchingLoads = + [&](ArrayRef> Loads, + SmallVectorImpl>> + &GatheredLoads, + SetVector &ToAdd, SetVector &Repeated, + int &Offset, unsigned &Start) { + SmallVector> Res; + if (Loads.empty()) + return GatheredLoads.end(); + LoadInst *LI = Loads.front().first; + for (auto [Idx, Data] : enumerate(GatheredLoads)) { + if (Idx < Start) + continue; + ToAdd.clear(); + if (LI->getParent() != Data.front().first->getParent()) + continue; + std::optional Dist = + getPointersDiff(LI->getType(), LI->getPointerOperand(), + Data.front().first->getType(), + Data.front().first->getPointerOperand(), DL, SE, + /*StrictCheck=*/true); + if (Dist) { + // Found matching gathered loads - check if all loads are unique or + // can be effectively vectorized. + unsigned NumUniques = 0; + for (auto [Cnt, Pair] : enumerate(Loads)) { + bool Used = any_of( + Data, [&, &P = Pair](const std::pair &PD) { + return PD.first == P.first; + }); + if (none_of(Data, + [&, &P = Pair](const std::pair &PD) { + return *Dist + P.second == PD.second; + }) && + !Used) { + ++NumUniques; + ToAdd.insert(Cnt); + } + if (Used) + Repeated.insert(Cnt); + } + if (NumUniques > 0 && + (Loads.size() == NumUniques || + (Loads.size() - NumUniques >= 2 && + Loads.size() - NumUniques >= Loads.size() / 2 && + (isPowerOf2_64(Data.size() + NumUniques) || + PowerOf2Ceil(Data.size()) < + PowerOf2Ceil(Data.size() + NumUniques))))) { + Offset = *Dist; + Start = Idx + 1; + return std::next(GatheredLoads.begin(), Idx); + } + } + } + ToAdd.clear(); + return GatheredLoads.end(); + }; + for (ArrayRef> Data : ClusteredLoads) { + unsigned Start = 0; + SetVector ToAdd, LocalToAdd, Repeated; + int Offset = 0; + auto *It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, + Offset, Start); + while (It != GatheredLoads.end()) { + assert(!LocalToAdd.empty() && "Expected some elements to add."); + for (unsigned Idx : 
LocalToAdd) + It->emplace_back(Data[Idx].first, Data[Idx].second + Offset); + ToAdd.insert(LocalToAdd.begin(), LocalToAdd.end()); + It = FindMatchingLoads(Data, GatheredLoads, LocalToAdd, Repeated, Offset, + Start); + } + if (any_of(seq(Data.size()), [&](unsigned Idx) { + return !ToAdd.contains(Idx) && !Repeated.contains(Idx); + })) { + auto AddNewLoads = + [&](SmallVectorImpl> &Loads) { + for (unsigned Idx : seq(Data.size())) { + if (ToAdd.contains(Idx) || Repeated.contains(Idx)) + continue; + Loads.push_back(Data[Idx]); + } + }; + if (!AddNew) { + LoadInst *LI = Data.front().first; + It = find_if( + GatheredLoads, [&](ArrayRef> PD) { + return PD.front().first->getParent() == LI->getParent() && + PD.front().first->getType() == LI->getType(); + }); + while (It != GatheredLoads.end()) { + AddNewLoads(*It); + It = std::find_if( + std::next(It), GatheredLoads.end(), + [&](ArrayRef> PD) { + return PD.front().first->getParent() == LI->getParent() && + PD.front().first->getType() == LI->getType(); + }); + } + } + GatheredLoads.emplace_back().append(Data.begin(), Data.end()); + AddNewLoads(GatheredLoads.emplace_back()); + } + } +} + +void BoUpSLP::tryToVectorizeGatheredLoads( + ArrayRef>> GatheredLoads) { + GatheredLoadsEntriesFirst = VectorizableTree.size(); + + SmallVector> LoadSetsToVectorize( + LoadEntriesToVectorize.size()); + for (auto [Idx, Set] : zip(LoadEntriesToVectorize, LoadSetsToVectorize)) + Set.insert(VectorizableTree[Idx]->Scalars.begin(), + VectorizableTree[Idx]->Scalars.end()); + + // Sort loads by distance. 
+ auto LoadSorter = [](const std::pair &L1, + const std::pair &L2) { + return L1.second > L2.second; + }; + + auto GetVectorizedRanges = [this]( + ArrayRef Loads, + BoUpSLP::ValueSet &VectorizedLoads, + SmallVectorImpl &NonVectorized) { + SmallVector, LoadsState>> Results; + unsigned StartIdx = 0; + SmallVector CandidateVFs; + if (VectorizeNonPowerOf2 && isPowerOf2_32(Loads.size() + 1)) + CandidateVFs.push_back(Loads.size()); + for (int NumElts = bit_floor(Loads.size()); NumElts > 1; NumElts /= 2) { + CandidateVFs.push_back(NumElts); + if (VectorizeNonPowerOf2 && NumElts > 2) + CandidateVFs.push_back(NumElts - 1); + } + + for (int NumElts : CandidateVFs) { + SmallVector MaskedGatherVectorized; + for (unsigned Cnt = StartIdx, E = Loads.size(); Cnt + NumElts <= E; + ++Cnt) { + ArrayRef Slice = ArrayRef(Loads).slice(Cnt, NumElts); + if (VectorizedLoads.count(Slice.front()) || + VectorizedLoads.count(Slice.back())) + continue; + // Check if it is profitable to try vectorizing gathered loads. It is + // profitable if we have more than 3 consecutive loads or if we have + // less but all users are vectorized or deleted. + bool AllowToVectorize = + NumElts >= 3 || + any_of(VectorizableTree, [=](const std::unique_ptr &TE) { + return TE->isGather() && TE->Scalars.size() == 2 && + (equal(TE->Scalars, Slice) || + equal(TE->Scalars, reverse(Slice))); + }); + // Check if it is profitable to vectorize 2-elements loads. + if (NumElts == 2) { + bool IsLegalBroadcastLoad = TTI->isLegalBroadcastLoad( + Slice.front()->getType(), ElementCount::getFixed(NumElts)); + auto CheckIfAllowed = [=](ArrayRef Slice) { + for (LoadInst *LI : Slice) { + // If single use/user - allow to vectorize. + if (LI->hasOneUse()) + continue; + // 1. Check if number of uses equal number of users. + // 2. All users are deleted. + // 3. The load broadcasts are not allowed or the load is not + // broadcasted. 
+ if (std::distance(LI->user_begin(), LI->user_end()) != + LI->getNumUses()) + return false; + for (User *U : LI->users()) { + if (auto *UI = dyn_cast(U); UI && isDeleted(UI)) + continue; + if (const TreeEntry *UTE = getTreeEntry(U)) { + if (!IsLegalBroadcastLoad) + // The broadcast is illegal - vectorize loads. + continue; + for (int I = 0, End = UTE->getNumOperands(); I < End; ++I) { + if (all_of(UTE->getOperand(I), + [LI](Value *V) { return V == LI; })) + // Found legal broadcast - do not vectorize. + return false; + } + } + } + } + return true; + }; + AllowToVectorize = CheckIfAllowed(Slice); + } + if (AllowToVectorize) { + SmallVector PointerOps; + OrdersType CurrentOrder; + // Try to build vector load. + ArrayRef Values( + reinterpret_cast(Slice.begin()), Slice.size()); + unsigned BestVF = 0; + LoadsState LS = canVectorizeLoads(Values, Slice.front(), CurrentOrder, + PointerOps, &BestVF); + if (LS != LoadsState::Gather || + (BestVF > 1 && static_cast(NumElts) == 2 * BestVF)) { + if (LS == LoadsState::ScatterVectorize) { + if (MaskedGatherVectorized.empty() || + Cnt >= MaskedGatherVectorized.back() + NumElts) + MaskedGatherVectorized.push_back(Cnt); + continue; + } + if (LS != LoadsState::Gather) { + Results.emplace_back(Values, LS); + VectorizedLoads.insert(Slice.begin(), Slice.end()); + // If we vectorized initial block, no need to try to vectorize it + // again. + if (Cnt == StartIdx) + StartIdx += NumElts; + } + // Erase last masked gather candidate, if another candidate within + // the range is found to be better. + if (!MaskedGatherVectorized.empty() && + Cnt < MaskedGatherVectorized.back() + NumElts) + MaskedGatherVectorized.pop_back(); + Cnt += NumElts - 1; + continue; + } + } + // Check if the whole array was vectorized already - exit. + if (StartIdx >= Loads.size()) + break; + } + // Mark masked gathers candidates as vectorized, if any. 
+ for (unsigned Cnt : MaskedGatherVectorized) { + ArrayRef Slice = ArrayRef(Loads).slice(Cnt, NumElts); + ArrayRef Values( + reinterpret_cast(Slice.begin()), Slice.size()); + Results.emplace_back(Values, LoadsState::ScatterVectorize); + VectorizedLoads.insert(Slice.begin(), Slice.end()); + // If we vectorized initial block, no need to try to vectorize it again. + if (Cnt == StartIdx) + StartIdx += NumElts; + } + } + for (LoadInst *LI : Loads) { + if (!VectorizedLoads.contains(LI)) + NonVectorized.push_back(LI); + } + return Results; + }; + auto ProcessGatheredLoads = [&](ArrayRef< + SmallVector>> + GatheredLoads) { + SmallVector NonVectorized; + for (ArrayRef> LoadsDists : GatheredLoads) { + SmallVector> LocalLoadsDists(LoadsDists); + SmallVector OriginalLoads(LocalLoadsDists.size()); + transform(LoadsDists, OriginalLoads.begin(), + [](const std::pair &L) { return L.first; }); + stable_sort(LocalLoadsDists, LoadSorter); + SmallVector Loads; + for (const std::pair &L : LocalLoadsDists) { + if (!getTreeEntry(L.first)) + Loads.push_back(L.first); + } + if (Loads.empty()) + continue; + BoUpSLP::ValueSet VectorizedLoads; + SmallVector SortedNonVectorized; + SmallVector, LoadsState>> Results = + GetVectorizedRanges(Loads, VectorizedLoads, SortedNonVectorized); + if (!Results.empty() && !SortedNonVectorized.empty() && + all_of(Results, + [](const std::pair, LoadsState> &P) { + return P.second == LoadsState::ScatterVectorize; + })) { + VectorizedLoads.clear(); + SmallVector UnsortedNonVectorized; + SmallVector, LoadsState>> UnsortedResults = + GetVectorizedRanges(OriginalLoads, VectorizedLoads, + UnsortedNonVectorized); + if (SortedNonVectorized.size() >= UnsortedNonVectorized.size()) { + SortedNonVectorized.swap(UnsortedNonVectorized); + Results.swap(UnsortedResults); + } + } + for (auto [Slice, _] : Results) { + LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize gathered loads (" + << Slice.size() << ")\n"); + if (any_of(Slice, [&](Value *V) { return getTreeEntry(V); })) { + 
for (Value *L : Slice) + if (!getTreeEntry(L)) + SortedNonVectorized.push_back(cast(L)); + continue; + } + + // Select maximum VF as a maximum of user gathered nodes and + // distance between scalar loads in these nodes. + unsigned MaxVF = Slice.size(); + unsigned UserMaxVF = 0; + std::optional SegmentedLoadsDistance = 0; + std::optional CommonVF = 0; + unsigned Order = 0; + DenseMap EntryToPosition; + SmallPtrSet DeinterleavedNodes; + for (auto [Idx, V] : enumerate(Slice)) { + for (const TreeEntry *E : ValueToGatherNodes.at(V)) { + UserMaxVF = std::max(UserMaxVF, E->Scalars.size()); + unsigned Pos = EntryToPosition.try_emplace(E, Idx).first->second; + UserMaxVF = std::max(UserMaxVF, Idx - Pos + 1); + if (CommonVF) { + if (*CommonVF == 0) { + CommonVF = E->Scalars.size(); + continue; + } + if (*CommonVF != E->Scalars.size()) + CommonVF.reset(); + } + if (Pos != Idx && SegmentedLoadsDistance) { + if (!DeinterleavedNodes.contains(E) && + any_of(E->Scalars, [&, Slice = Slice](Value *V) { + if (isa(V)) + return false; + if (getTreeEntry(V)) + return true; + const auto &Nodes = ValueToGatherNodes.at(V); + return (Nodes.size() != 1 || !Nodes.contains(E)) && + !is_contained(Slice, V); + })) { + SegmentedLoadsDistance.reset(); + continue; + } + DeinterleavedNodes.insert(E); + if (*SegmentedLoadsDistance == 0) { + SegmentedLoadsDistance = Idx - Pos; + continue; + } + if ((Idx - Pos) % *SegmentedLoadsDistance != 0 || + (Idx - Pos) / *SegmentedLoadsDistance < Order) + SegmentedLoadsDistance.reset(); + Order = (Idx - Pos) / SegmentedLoadsDistance.value_or(1); + } + } + } + DeinterleavedNodes.clear(); + unsigned InterleaveFactor = 0; + // Check if the large load represents interleaved load operation. + if (SegmentedLoadsDistance.value_or(0) > 1 && + CommonVF.value_or(0) != 0) { + InterleaveFactor = PowerOf2Ceil(*SegmentedLoadsDistance); + unsigned VF = *CommonVF; + OrdersType Order; + SmallVector PointerOps; + // Segmented load detected - vectorize at maximum vector factor. 
+ if (TTI->isLegalInterleavedAccessType( + getWidenedType(Slice.front()->getType(), VF), + InterleaveFactor, cast(Slice.front())->getAlign(), + cast(Slice.front())->getPointerAddressSpace()) && + canVectorizeLoads(Slice, Slice.front(), Order, PointerOps) == + LoadsState::Vectorize) { + UserMaxVF = InterleaveFactor * VF; + } else { + UserMaxVF = VF; + InterleaveFactor = 0; + } + } + // Try to build long masked gather loads. + UserMaxVF = PowerOf2Ceil(UserMaxVF); + if (InterleaveFactor == 0 && + any_of(seq(Slice.size() / UserMaxVF), + [&, Slice = Slice](unsigned Idx) { + OrdersType Order; + SmallVector PointerOps; + return canVectorizeLoads( + Slice.slice(Idx * UserMaxVF, UserMaxVF), + Slice[Idx * UserMaxVF], Order, + PointerOps) == LoadsState::ScatterVectorize; + })) + UserMaxVF = MaxVF; + // Cannot represent the loads as consecutive vectorizable nodes - + // just exit. + unsigned ConsecutiveNodesSize = 0; + if (!LoadEntriesToVectorize.empty() && + (SegmentedLoadsDistance.value_or(0) == 0 || + CommonVF.value_or(UserMaxVF) == UserMaxVF) && + any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize), + [&, Slice = Slice](const auto &P) { + const auto *It = find_if(Slice, [&](Value *V) { + return std::get<1>(P).contains(V); + }); + if (It == Slice.end()) + return false; + ArrayRef VL = + VectorizableTree[std::get<0>(P)]->Scalars; + ConsecutiveNodesSize += VL.size(); + unsigned Start = std::distance(Slice.begin(), It); + unsigned Sz = Slice.size() - Start; + return Sz < VL.size() || + Slice.slice(std::distance(Slice.begin(), It), + VL.size()) != VL; + })) + continue; + if (Slice.size() != ConsecutiveNodesSize) + MaxVF = std::min(MaxVF, UserMaxVF); + for (unsigned VF = MaxVF; VF >= 2; VF /= 2) { + bool IsVectorized = true; + for (unsigned I = 0, E = Slice.size(); I < E; I += VF) { + ArrayRef SubSlice = Slice.slice(I, std::min(VF, E - I)); + if (getTreeEntry(SubSlice.front())) + continue; + // Check if the subslice is to be-vectorized entry, which is not + // equal to 
entry. + if (any_of( + zip(LoadEntriesToVectorize, LoadSetsToVectorize), + [&](const auto &P) { + return !SubSlice.equals( + VectorizableTree[std::get<0>(P)]->Scalars) && + set_is_subset(SubSlice, std::get<1>(P)); + })) + continue; + unsigned Sz = VectorizableTree.size(); + buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor); + if (Sz == VectorizableTree.size()) { + IsVectorized = false; + // Try non-interleaved vectorization with smaller vector factor. + if (InterleaveFactor > 0) { + VF = 2 * (MaxVF / InterleaveFactor); + InterleaveFactor = 0; + } + continue; + } + } + if (IsVectorized) + break; + } + } + NonVectorized.append(SortedNonVectorized); + } + return NonVectorized; + }; + SmallVector NonVectorized = ProcessGatheredLoads(GatheredLoads); + SmallVector>> FinalGatheredLoads; + for (LoadInst *LI : NonVectorized) { + // Reinsert non-vectorized loads to other list of loads with the same + // base pointers. + gatherPossiblyVectorizableLoads(*this, LI, *DL, *SE, *TTI, + FinalGatheredLoads, + /*AddNew=*/false); + } + // Final attempt to vectorize non-vectorized loads. + (void)ProcessGatheredLoads(FinalGatheredLoads); + // Try to vectorize postponed load entries, previously marked as gathered. + for (unsigned Idx : LoadEntriesToVectorize) + buildTree_rec(VectorizableTree[Idx]->Scalars, 0, EdgeInfo()); + // If no new entries created, consider it as no gathered loads entries must be + // handled. + if (static_cast(GatheredLoadsEntriesFirst) == + VectorizableTree.size()) + GatheredLoadsEntriesFirst = NoGatheredLoads; +} + /// \return true if the specified list of values has only one instruction that /// requires scheduling, false otherwise. 
#ifndef NDEBUG @@ -6466,7 +7123,7 @@ bool BoUpSLP::areAltOperandsProfitable(const InstructionsState &S, BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( InstructionsState &S, ArrayRef VL, bool IsScatterVectorizeUserTE, - OrdersType &CurrentOrder, SmallVectorImpl &PointerOps) const { + OrdersType &CurrentOrder, SmallVectorImpl &PointerOps) { assert(S.MainOp && "Expected instructions with same/alternate opcodes only."); unsigned ShuffleOrOp = @@ -6543,8 +7200,20 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( case LoadsState::Vectorize: return TreeEntry::Vectorize; case LoadsState::ScatterVectorize: + if (GatheredLoadsEntriesFirst == NoGatheredLoads && + !VectorizableTree.empty()) { + // Delay slow vectorized nodes for better vectorization attempts. + LoadEntriesToVectorize.insert(VectorizableTree.size()); + return TreeEntry::NeedToGather; + } return TreeEntry::ScatterVectorize; case LoadsState::StridedVectorize: + if (GatheredLoadsEntriesFirst == NoGatheredLoads && + !VectorizableTree.empty()) { + // Delay slow vectorized nodes for better vectorization attempts. + LoadEntriesToVectorize.insert(VectorizableTree.size()); + return TreeEntry::NeedToGather; + } return TreeEntry::StridedVectorize; case LoadsState::Gather: #ifndef NDEBUG @@ -6877,7 +7546,8 @@ class PHIHandler { } // namespace void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, - const EdgeInfo &UserTreeIdx) { + const EdgeInfo &UserTreeIdx, + unsigned InterleaveFactor) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); SmallVector ReuseShuffleIndices; @@ -7100,7 +7770,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, // Check if this is a duplicate of another entry. 
if (TreeEntry *E = getTreeEntry(S.OpValue)) { LLVM_DEBUG(dbgs() << "SLP: \tChecking bundle: " << *S.OpValue << ".\n"); - if (!E->isSame(VL)) { + if (GatheredLoadsEntriesFirst != NoGatheredLoads || !E->isSame(VL)) { auto It = MultiNodeScalars.find(S.OpValue); if (It != MultiNodeScalars.end()) { auto *TEIt = find_if(It->getSecond(), @@ -7352,7 +8022,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, switch (State) { case TreeEntry::Vectorize: TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndices, CurrentOrder); + ReuseShuffleIndices, CurrentOrder, InterleaveFactor); if (CurrentOrder.empty()) LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); else @@ -8380,7 +9050,8 @@ void BoUpSLP::transformNodes() { unsigned MinVF = getMinVF(2 * Sz); if (VL.size() <= 2 || (E.getOpcode() && - (E.isAltShuffle() || E.getOpcode() != Instruction::Load))) + (E.isAltShuffle() || E.getOpcode() != Instruction::Load || + LoadEntriesToVectorize.contains(Idx)))) continue; // Try to find vectorizable sequences and transform them into a series of // insertvector instructions. @@ -8403,6 +9074,8 @@ void BoUpSLP::transformNodes() { if (PrevSize + 1 == VectorizableTree.size() && VectorizableTree[PrevSize]->isGather()) { VectorizableTree.pop_back(); + LoadEntriesToVectorize.remove_if( + [&](unsigned Idx) { return Idx == PrevSize; }); continue; } E.CombinedEntriesWithIndices.emplace_back(PrevSize, Cnt); @@ -8492,6 +9165,19 @@ void BoUpSLP::transformNodes() { break; } } + // A list of loads to be gathered during the vectorization process. We can + // try to vectorize them at the end, if profitable. + SmallVector>> GatheredLoads; + + for (std::unique_ptr &TE : VectorizableTree) { + TreeEntry &E = *TE; + if (E.isGather() && !isSplat(E.Scalars)) + gatherPossiblyVectorizableLoads(*this, E.Scalars, *DL, *SE, *TTI, + GatheredLoads); + } + // Try to vectorize gathered loads if this is not just a gather of loads. 
+ if (!GatheredLoads.empty()) + tryToVectorizeGatheredLoads(GatheredLoads); } /// Merges shuffle masks and emits final shuffle instruction, if required. It @@ -8898,6 +9584,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { Idx = EMask[Idx]; } CommonVF = E->Scalars.size(); + } else if (std::optional Factor = E->getInterleaveFactor(); + Factor && E->Scalars.size() != Mask.size() && + ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask, + *Factor)) { + // Deinterleaved nodes are free. + std::iota(CommonMask.begin(), CommonMask.end(), 0); } ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF); V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF)); @@ -9543,7 +10235,9 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, InstructionCost VecCost = VectorCost(CommonCost); // Check if the current node must be resized, if the parent node is not // resized. - if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0) { + if (!UnaryInstruction::isCast(E->getOpcode()) && E->Idx != 0 && + (E->getOpcode() != Instruction::Load || + !E->UserTreeIndices.empty())) { const EdgeInfo &EI = E->UserTreeIndices.front(); if ((EI.UserTE->getOpcode() != Instruction::Select || EI.EdgeIdx != 0) && @@ -9982,7 +10676,7 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, auto *LI0 = cast(VL0); auto GetVectorCost = [&](InstructionCost CommonCost) { InstructionCost VecLdCost; - if (E->State == TreeEntry::Vectorize) { + if (E->State == TreeEntry::Vectorize && !E->getInterleaveFactor()) { VecLdCost = TTI->getMemoryOpCost( Instruction::Load, VecTy, LI0->getAlign(), LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo()); @@ -9992,6 +10686,11 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, VecLdCost = TTI->getStridedMemoryOpCost( Instruction::Load, VecTy, LI0->getPointerOperand(), /*VariableMask=*/false, CommonAlignment, CostKind); + } else if (std::optional Factor = E->getInterleaveFactor(); + 
E->State == TreeEntry::Vectorize && Factor.value_or(0) > 0) { + VecLdCost = TTI->getInterleavedMemoryOpCost( + Instruction::Load, VecTy, *Factor, std::nullopt, LI0->getAlign(), + LI0->getPointerAddressSpace(), CostKind); } else { assert(E->State == TreeEntry::ScatterVectorize && "Unknown EntryState"); Align CommonAlignment = @@ -10223,8 +10922,8 @@ bool BoUpSLP::isFullyVectorizableTinyTree(bool ForReduction) const { ((TE->getOpcode() == Instruction::ExtractElement || all_of(TE->Scalars, IsaPred)) && isFixedVectorShuffle(TE->Scalars, Mask)) || - (TE->isGather() && TE->getOpcode() == Instruction::Load && - !TE->isAltShuffle())); + (TE->getOpcode() == Instruction::Load && !TE->isAltShuffle()) || + any_of(TE->Scalars, IsaPred)); }; // We only handle trees of heights 1 and 2. @@ -10689,6 +11388,11 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { } } + // Exclude cost of gather loads nodes which are not used. These nodes were + // built as part of the final attempt to vectorize gathered loads. + assert((!TE.isGather() || TE.Idx == 0 || !TE.UserTreeIndices.empty()) && + "Expected gather nodes with users only."); + InstructionCost C = getEntryCost(&TE, VectorizedVals, CheckedExtracts); Cost += C; LLVM_DEBUG(dbgs() << "SLP: Adding cost " << C << " for bundle " @@ -10896,7 +11600,9 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals) { if (IsProfitablePHIUser) { KeepScalar = true; } else if (KeepScalar && ScalarCost != TTI::TCC_Free && - ExtraCost - ScalarCost <= TTI::TCC_Basic) { + ExtraCost - ScalarCost <= TTI::TCC_Basic && + (GatheredLoadsEntriesFirst == NoGatheredLoads || + Entry->Idx < GatheredLoadsEntriesFirst)) { unsigned ScalarUsesCount = count_if(Entry->Scalars, [&](Value *V) { return ValueToExtUses->contains(V); }); @@ -11220,7 +11926,9 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( Entries.clear(); // TODO: currently checking only for Scalars in the tree entry, need to count // reused elements too for better cost estimation. 
- const EdgeInfo &TEUseEI = TE->UserTreeIndices.front(); + const EdgeInfo &TEUseEI = TE == VectorizableTree.front().get() + ? EdgeInfo(const_cast(TE), 0) + : TE->UserTreeIndices.front(); const Instruction *TEInsertPt = &getLastInstructionInBundle(TEUseEI.UserTE); const BasicBlock *TEInsertBlock = nullptr; // Main node of PHI entries keeps the correct order of operands/incoming @@ -11315,7 +12023,7 @@ BoUpSLP::isGatherShuffledSingleRegisterEntry( VToTEs.insert(TEPtr); } if (const TreeEntry *VTE = getTreeEntry(V)) { - if (ForOrder) { + if (ForOrder && VTE->Idx < GatheredLoadsEntriesFirst) { if (VTE->State != TreeEntry::Vectorize) { auto It = MultiNodeScalars.find(V); if (It == MultiNodeScalars.end()) @@ -11593,13 +12301,19 @@ BoUpSLP::isGatherShuffledEntry( "Expected positive number of registers."); Entries.clear(); // No need to check for the topmost gather node. - if (TE == VectorizableTree.front().get()) + if (TE == VectorizableTree.front().get() && + (GatheredLoadsEntriesFirst == NoGatheredLoads || + none_of(ArrayRef(VectorizableTree).drop_front(), + [](const std::unique_ptr &TE) { + return !TE->isGather(); + }))) return {}; // FIXME: Gathering for non-power-of-2 nodes not implemented yet. if (TE->isNonPowOf2Vec()) return {}; Mask.assign(VL.size(), PoisonMaskElem); - assert(TE->UserTreeIndices.size() == 1 && + assert((TE->UserTreeIndices.size() == 1 || + TE == VectorizableTree.front().get()) && "Expected only single user of the gather node."); assert(VL.size() % NumParts == 0 && "Number of scalars must be divisible by NumParts."); @@ -11718,17 +12432,23 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { return *Res.second; // Get the basic block this bundle is in. All instructions in the bundle // should be in this block (except for extractelement-like instructions with - // constant indeces). + // constant indecies or gathered loads). 
auto *Front = E->getMainOp(); auto *BB = Front->getParent(); - assert(llvm::all_of(E->Scalars, [=](Value *V) -> bool { - if (E->getOpcode() == Instruction::GetElementPtr && - !isa(V)) - return true; - auto *I = cast(V); - return !E->isOpcodeOrAlt(I) || I->getParent() == BB || - isVectorLikeInstWithConstOps(I); - })); + assert(((GatheredLoadsEntriesFirst != NoGatheredLoads && + E->getOpcode() == Instruction::Load && E->isGather() && + E->Idx < GatheredLoadsEntriesFirst) || + all_of(E->Scalars, + [=](Value *V) -> bool { + if (E->getOpcode() == Instruction::GetElementPtr && + !isa(V)) + return true; + auto *I = cast(V); + return !E->isOpcodeOrAlt(I) || I->getParent() == BB || + isVectorLikeInstWithConstOps(I); + })) && + "Expected gathered loads or GEPs or instructions from same basic " + "block."); auto FindLastInst = [&]() { Instruction *LastInst = Front; @@ -11744,7 +12464,10 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { assert(((E->getOpcode() == Instruction::GetElementPtr && !isa(I)) || (isVectorLikeInstWithConstOps(LastInst) && - isVectorLikeInstWithConstOps(I))) && + isVectorLikeInstWithConstOps(I)) || + (GatheredLoadsEntriesFirst != NoGatheredLoads && + E->getOpcode() == Instruction::Load && E->isGather() && + E->Idx < GatheredLoadsEntriesFirst)) && "Expected vector-like or non-GEP in GEP node insts only."); if (!DT->isReachableFromEntry(LastInst->getParent())) { LastInst = I; @@ -11801,6 +12524,13 @@ Instruction &BoUpSLP::getLastInstructionInBundle(const TreeEntry *E) { return FirstInst; }; + // Set insertpoint for gathered loads to the very first load. + if (GatheredLoadsEntriesFirst != NoGatheredLoads && + E->Idx >= GatheredLoadsEntriesFirst && !E->isGather() && + E->getOpcode() == Instruction::Load) { + Res.second = FindFirstInst(); + return *Res.second; + } // Set the insert point to the beginning of the basic block if the entry // should not be scheduled. 
if (doesNotNeedToSchedule(E->Scalars) || @@ -12777,6 +13507,12 @@ ResTy BoUpSLP::processBuildVector(const TreeEntry *E, Type *ScalarTy, } // Gather extracts after we check for full matched gathers only. if (!ExtractShuffles.empty() || E->getOpcode() != Instruction::Load || + ((E->getOpcode() == Instruction::Load || + any_of(E->Scalars, IsaPred)) && + any_of(E->Scalars, + [this](Value *V) { + return isa(V) && getTreeEntry(V); + })) || E->isAltShuffle() || all_of(E->Scalars, [this](Value *V) { return getTreeEntry(V); }) || isSplat(E->Scalars) || @@ -14135,6 +14871,18 @@ Value *BoUpSLP::vectorizeTree( else Builder.SetInsertPoint(&F->getEntryBlock(), F->getEntryBlock().begin()); + // Emit gathered loads first to emit better code for the users of those + // gathered loads. + for (const std::unique_ptr &TE : VectorizableTree) { + if (GatheredLoadsEntriesFirst != NoGatheredLoads && + TE->Idx >= GatheredLoadsEntriesFirst && + (!TE->isGather() || !TE->UserTreeIndices.empty())) { + assert((!TE->UserTreeIndices.empty() || + (TE->getOpcode() == Instruction::Load && !TE->isGather())) && + "Expected gathered load node."); + (void)vectorizeTree(TE.get(), /*PostponedPHIs=*/false); + } + } // Postpone emission of PHIs operands to avoid cyclic dependencies issues. 
(void)vectorizeTree(VectorizableTree[0].get(), /*PostponedPHIs=*/true); for (const std::unique_ptr &TE : VectorizableTree) @@ -14711,10 +15459,15 @@ Value *BoUpSLP::vectorizeTree( if (IE->Idx != 0 && !(VectorizableTree.front()->isGather() && isa(I) && !IE->UserTreeIndices.empty() && - any_of(IE->UserTreeIndices, [&](const EdgeInfo &EI) { - return EI.UserTE == VectorizableTree.front().get() && - EI.EdgeIdx == UINT_MAX; - }))) + any_of(IE->UserTreeIndices, + [&](const EdgeInfo &EI) { + return EI.UserTE == VectorizableTree.front().get() && + EI.EdgeIdx == UINT_MAX; + })) && + !(GatheredLoadsEntriesFirst != NoGatheredLoads && + IE->Idx >= GatheredLoadsEntriesFirst && + VectorizableTree.front()->isGather() && + is_contained(VectorizableTree.front()->Scalars, I))) continue; SmallVector LogicalOpSelects; I->replaceUsesWithIf(PoisonValue::get(I->getType()), [&](Use &U) { diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll index ff1d6253bec92..fffa626cae0dd 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/tsc-s116.ll @@ -23,14 +23,12 @@ define void @s116_modified(ptr %a) { ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[GEP1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <2 x float>, ptr [[GEP3]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[LD0]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[TMP3]], <4 x float> [[TMP4]], <4 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP5]], <2 x float> [[TMP2]], i64 2) -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = 
shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP7]], <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP9]] -; CHECK-NEXT: store <4 x float> [[TMP10]], ptr [[A]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> [[TMP2]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <4 x float> [[TMP6]], [[TMP7]] +; CHECK-NEXT: store <4 x float> [[TMP8]], ptr [[A]], align 4 ; CHECK-NEXT: ret void ; %gep1 = getelementptr inbounds float, ptr %a, i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll index 67746f2cbf5d2..d4dbb8bbfaf0d 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vec3-calls.ll @@ -1,8 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK %s -; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=CHECK %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2 -mtriple=arm64-apple-ios -S %s | FileCheck --check-prefixes=NON-POWER-OF-2 %s +; RUN: opt -passes=slp-vectorizer -slp-vectorize-non-power-of-2=false -mtriple=arm64-apple-ios -S %s | FileCheck %s define void @vec3_vectorize_call(ptr %Colour, float %0) { +; NON-POWER-OF-2-LABEL: @vec3_vectorize_call( +; NON-POWER-OF-2-NEXT: entry: +; NON-POWER-OF-2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4 +; NON-POWER-OF-2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2 +; NON-POWER-OF-2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> +; NON-POWER-OF-2-NEXT: [[TMP4:%.*]] = 
shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> +; NON-POWER-OF-2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) +; NON-POWER-OF-2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4 +; NON-POWER-OF-2-NEXT: ret void +; ; CHECK-LABEL: @vec3_vectorize_call( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4 @@ -28,6 +38,19 @@ entry: } define void @vec3_fmuladd_64(ptr %Colour, double %0) { +; NON-POWER-OF-2-LABEL: @vec3_fmuladd_64( +; NON-POWER-OF-2-NEXT: entry: +; NON-POWER-OF-2-NEXT: [[ARRAYIDX80:%.*]] = getelementptr float, ptr [[COLOUR:%.*]], i64 2 +; NON-POWER-OF-2-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[TMP0:%.*]], i32 0 +; NON-POWER-OF-2-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> zeroinitializer +; NON-POWER-OF-2-NEXT: [[TMP3:%.*]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP2]], <2 x double> zeroinitializer, <2 x double> zeroinitializer) +; NON-POWER-OF-2-NEXT: [[TMP4:%.*]] = fptrunc <2 x double> [[TMP3]] to <2 x float> +; NON-POWER-OF-2-NEXT: store <2 x float> [[TMP4]], ptr [[COLOUR]], align 4 +; NON-POWER-OF-2-NEXT: [[TMP5:%.*]] = call double @llvm.fmuladd.f64(double [[TMP0]], double 0.000000e+00, double 0.000000e+00) +; NON-POWER-OF-2-NEXT: [[CONV82:%.*]] = fptrunc double [[TMP5]] to float +; NON-POWER-OF-2-NEXT: store float [[CONV82]], ptr [[ARRAYIDX80]], align 4 +; NON-POWER-OF-2-NEXT: ret void +; ; CHECK-LABEL: @vec3_fmuladd_64( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX80:%.*]] = getelementptr float, ptr [[COLOUR:%.*]], i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll index f04c359b432b5..9c086abe216c0 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/vectorizable-selects-uniform-cmps.ll @@ -245,34 +245,24 @@ define void @select_uniform_ugt_16xi8(ptr %ptr, i8 %x) { ; CHECK-NEXT: [[L_8:%.*]] = load i8, ptr [[GEP_8]], align 1 ; CHECK-NEXT: [[CMP_8:%.*]] = icmp ugt i8 [[L_8]], -1 ; CHECK-NEXT: [[GEP_9:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 9 -; CHECK-NEXT: [[L_9:%.*]] = load i8, ptr [[GEP_9]], align 1 -; CHECK-NEXT: [[GEP_10:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 10 -; CHECK-NEXT: [[L_10:%.*]] = load i8, ptr [[GEP_10]], align 1 ; CHECK-NEXT: [[GEP_11:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 11 ; CHECK-NEXT: [[L_11:%.*]] = load i8, ptr [[GEP_11]], align 1 ; CHECK-NEXT: [[GEP_12:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i8 12 ; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[PTR]], align 1 ; CHECK-NEXT: [[TMP1:%.*]] = extractelement <8 x i8> [[TMP0]], i32 0 ; CHECK-NEXT: [[S_8:%.*]] = select i1 [[CMP_8]], i8 [[TMP1]], i8 [[X:%.*]] -; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1 -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[L_9]], i32 9 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[L_10]], i32 10 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[GEP_9]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i8>, ptr [[GEP_12]], align 1 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i8> [[TMP2]], <2 x i8> poison, <8 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP4]], <16 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[L_11]], i32 11 ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> [[TMP6]], <8 x i8> [[TMP0]], i64 0) -; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> 
[[TMP7]], <4 x i8> [[TMP2]], i64 12) +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP7]], <4 x i8> [[TMP3]], i64 12) ; CHECK-NEXT: [[TMP9:%.*]] = icmp ugt <16 x i8> [[TMP8]], -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i8> [[TMP2]], <4 x i8> poison, <8 x i32> -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <8 x i8> [[TMP0]], <8 x i8> [[TMP11]], <16 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[L_9]], i32 9 -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[L_10]], i32 10 -; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[L_11]], i32 11 -; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <16 x i8> [[TMP15]], <16 x i8> poison, <16 x i32> -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i8> [[TMP17]], <16 x i8> poison, <16 x i32> zeroinitializer -; CHECK-NEXT: [[TMP19:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP16]], <16 x i8> [[TMP18]] -; CHECK-NEXT: store <16 x i8> [[TMP19]], ptr [[PTR]], align 2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x i8> poison, i8 [[X]], i32 0 +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <16 x i8> [[TMP10]], <16 x i8> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP12:%.*]] = select <16 x i1> [[TMP9]], <16 x i8> [[TMP8]], <16 x i8> [[TMP11]] +; CHECK-NEXT: store <16 x i8> [[TMP12]], ptr [[PTR]], align 2 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index 36681ecea4f50..d222d87e01b7a 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -11,12 +11,9 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: 
[[TMP1:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4 ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1 -; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i8, ptr [[PIX2]], i64 1 -; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr i8, ptr [[PIX1]], i64 5 -; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr i8, ptr [[PIX2]], i64 5 ; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3 -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1 -; CHECK-NEXT: [[CONV33:%.*]] = zext i8 [[TMP10]] to i32 +; CHECK-NEXT: [[TMP2:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1 +; CHECK-NEXT: [[CONV33:%.*]] = zext i8 [[TMP2]] to i32 ; CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ADD_PTR3]], align 1 @@ -24,145 +21,154 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4 ; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1 -; CHECK-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 1 -; CHECK-NEXT: [[ARRAYIDX25_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 5 -; CHECK-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 5 ; CHECK-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1 -; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP14]] to i32 +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1 +; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP3]] to i32 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: 
[[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1 -; CHECK-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 1 -; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 5 -; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 5 -; CHECK-NEXT: [[TMP4:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP16:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR64_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i8> [[TMP6]] to <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP16]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP13:%.*]] = zext <2 x i8> [[TMP9]] to <2 x i32> -; CHECK-NEXT: [[TMP28:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP12:%.*]] = zext <2 x i8> [[TMP28]] to <2 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = zext <2 x i8> [[TMP5]] to <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = zext <2 x i8> [[TMP8]] to <2 x i32> 
+; CHECK-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP6]], [[TMP9]] +; CHECK-NEXT: [[TMP18:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP18]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = zext <2 x i8> [[TMP12]] to <2 x i32> +; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP13]], [[TMP16]] ; CHECK-NEXT: [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], -; CHECK-NEXT: [[TMP15:%.*]] = add <2 x i32> [[TMP25]], [[TMP8]] -; CHECK-NEXT: [[TMP29:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP17:%.*]] = zext <2 x i8> [[TMP29]] to <2 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP19:%.*]] = zext <2 x i8> [[TMP18]] to <2 x i32> -; CHECK-NEXT: [[TMP20:%.*]] = sub <2 x i32> [[TMP17]], [[TMP19]] -; CHECK-NEXT: [[TMP21:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP30:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP22]], [[TMP30]] +; CHECK-NEXT: [[TMP19:%.*]] = add <2 x i32> [[TMP25]], [[TMP10]] +; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP17:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP7]], 
<4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32> +; CHECK-NEXT: [[TMP29:%.*]] = sub <2 x i32> [[TMP17]], [[TMP23]] +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP18]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP30]] to <2 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP14]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32> +; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP26]], [[TMP28]] ; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], -; CHECK-NEXT: [[TMP27:%.*]] = add <2 x i32> [[TMP37]], [[TMP20]] -; CHECK-NEXT: [[TMP26:%.*]] = add <2 x i32> [[TMP27]], [[TMP15]] -; CHECK-NEXT: [[TMP38:%.*]] = sub <2 x i32> [[TMP15]], [[TMP27]] -; CHECK-NEXT: [[ADD44_2:%.*]] = extractelement <2 x i32> [[TMP26]], i32 0 -; CHECK-NEXT: [[CONV:%.*]] = extractelement <2 x i32> [[TMP26]], i32 1 -; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[CONV]], [[ADD44_2]] -; CHECK-NEXT: [[SUB51_2:%.*]] = sub i32 [[ADD44_2]], [[CONV]] -; CHECK-NEXT: [[SUB45_2:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0 -; CHECK-NEXT: [[SUB47_2:%.*]] = extractelement <2 x i32> [[TMP38]], i32 1 -; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]] -; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[SUB45_2]], [[SUB47_2]] -; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4 -; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 -; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1 -; CHECK-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr i8, ptr null, i64 1 +; CHECK-NEXT: [[TMP31:%.*]] = add <2 x i32> [[TMP37]], [[TMP29]] +; CHECK-NEXT: [[TMP33:%.*]] = add <2 x i32> [[TMP31]], [[TMP19]] +; CHECK-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP19]], [[TMP31]] +; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP33]], i32 0 +; CHECK-NEXT: [[TMP59:%.*]] = extractelement <2 x i32> [[TMP33]], i32 1 +; CHECK-NEXT: 
[[ADD48_2:%.*]] = add i32 [[TMP59]], [[TMP34]] +; CHECK-NEXT: [[ARRAYIDX5_4:%.*]] = getelementptr i8, ptr null, i64 4 +; CHECK-NEXT: [[TMP41:%.*]] = load <2 x i8>, ptr null, align 1 +; CHECK-NEXT: [[TMP42:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = load <2 x i8>, ptr null, align 1 +; CHECK-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = sub <2 x i32> [[TMP42]], [[TMP39]] +; CHECK-NEXT: [[TMP46:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP46]] to <2 x i32> +; CHECK-NEXT: [[TMP50:%.*]] = shufflevector <2 x i32> [[TMP49]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_4]], align 1 +; CHECK-NEXT: [[TMP48:%.*]] = zext <2 x i8> [[TMP45]] to <2 x i32> +; CHECK-NEXT: [[TMP51:%.*]] = sub <2 x i32> [[TMP50]], [[TMP48]] +; CHECK-NEXT: [[TMP52:%.*]] = shl <2 x i32> [[TMP51]], +; CHECK-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP52]], [[TMP40]] +; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2 +; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2 ; CHECK-NEXT: [[TMP44:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5 +; CHECK-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6 ; CHECK-NEXT: [[TMP43:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP33:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32> -; CHECK-NEXT: [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = sub <2 x i32> [[TMP33]], [[TMP39]] -; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> 
@llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP42:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32> -; CHECK-NEXT: [[TMP58:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP59:%.*]] = zext <2 x i8> [[TMP58]] to <2 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP42]], [[TMP59]] -; CHECK-NEXT: [[TMP46:%.*]] = shl <2 x i32> [[TMP45]], -; CHECK-NEXT: [[TMP62:%.*]] = add <2 x i32> [[TMP46]], [[TMP40]] -; CHECK-NEXT: [[TMP48:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP48]] to <2 x i32> -; CHECK-NEXT: [[TMP50:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32> -; CHECK-NEXT: [[TMP52:%.*]] = sub <2 x i32> [[TMP49]], [[TMP51]] +; CHECK-NEXT: [[TMP57:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1 +; CHECK-NEXT: [[TMP53:%.*]] = zext <2 x i8> [[TMP57]] to <2 x i32> +; CHECK-NEXT: [[TMP54:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1 +; CHECK-NEXT: [[TMP56:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32> +; CHECK-NEXT: [[TMP55:%.*]] = sub <2 x i32> [[TMP53]], [[TMP56]] ; CHECK-NEXT: [[TMP64:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0 ; CHECK-NEXT: [[TMP65:%.*]] = insertelement <2 x i8> [[TMP64]], i8 [[TMP43]], i32 1 -; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32> -; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP55]], [[TMP57]] +; CHECK-NEXT: [[TMP58:%.*]] = zext <2 x i8> [[TMP65]] to <2 x 
i32> +; CHECK-NEXT: [[TMP60:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1 +; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP60]] to <2 x i32> +; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP58]], [[TMP61]] ; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], -; CHECK-NEXT: [[TMP60:%.*]] = add <2 x i32> [[TMP70]], [[TMP52]] -; CHECK-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP60]], [[TMP62]] -; CHECK-NEXT: [[TMP47:%.*]] = sub <2 x i32> [[TMP62]], [[TMP60]] -; CHECK-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0 +; CHECK-NEXT: [[TMP47:%.*]] = add <2 x i32> [[TMP70]], [[TMP55]] +; CHECK-NEXT: [[TMP67:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0 ; CHECK-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1 -; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP75]], [[TMP74]] -; CHECK-NEXT: [[SUB51_3:%.*]] = sub i32 [[TMP74]], [[TMP75]] -; CHECK-NEXT: [[TMP61:%.*]] = extractelement <2 x i32> [[TMP47]], i32 0 +; CHECK-NEXT: [[ADD44_3:%.*]] = add i32 [[TMP75]], [[TMP67]] +; CHECK-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP67]], [[TMP75]] +; CHECK-NEXT: [[TMP68:%.*]] = extractelement <2 x i32> [[TMP47]], i32 0 ; CHECK-NEXT: [[TMP79:%.*]] = extractelement <2 x i32> [[TMP47]], i32 1 -; CHECK-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP79]], [[TMP61]] -; CHECK-NEXT: [[SUB59_3:%.*]] = sub i32 [[TMP61]], [[TMP79]] +; CHECK-NEXT: [[ADD46_3:%.*]] = add i32 [[TMP79]], [[TMP68]] +; CHECK-NEXT: [[SUB47_3:%.*]] = sub i32 [[TMP68]], [[TMP79]] +; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[ADD46_3]], [[ADD44_3]] +; CHECK-NEXT: [[TMP81:%.*]] = shufflevector <2 x i32> [[TMP33]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP89:%.*]] = insertelement <2 x i32> [[TMP81]], i32 [[ADD44_3]], i32 0 +; CHECK-NEXT: [[TMP71:%.*]] = insertelement <2 x i32> [[TMP33]], i32 [[ADD46_3]], i32 0 +; CHECK-NEXT: [[TMP94:%.*]] = sub <2 x i32> [[TMP89]], [[TMP71]] +; CHECK-NEXT: [[TMP73:%.*]] = insertelement <2 x i32> [[TMP35]], i32 [[SUB47_3]], i32 0 +; CHECK-NEXT: [[TMP74:%.*]] = 
shufflevector <2 x i32> [[TMP35]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP99:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[SUB45_3]], i32 0 +; CHECK-NEXT: [[TMP76:%.*]] = add <2 x i32> [[TMP73]], [[TMP99]] +; CHECK-NEXT: [[TMP77:%.*]] = sub <2 x i32> [[TMP99]], [[TMP73]] ; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]] ; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]] -; CHECK-NEXT: [[TMP63:%.*]] = extractelement <2 x i32> [[TMP33]], i32 0 +; CHECK-NEXT: [[TMP63:%.*]] = extractelement <2 x i32> [[TMP42]], i32 0 ; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[TMP63]], 15 ; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537 ; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535 -; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[CONV]], 15 +; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP59]], 15 ; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 ; CHECK-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535 -; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]] -; CHECK-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]] -; CHECK-NEXT: [[TMP107:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 -; CHECK-NEXT: [[SHR_I49_5:%.*]] = lshr i32 [[TMP107]], 15 +; CHECK-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP76]], i32 0 +; CHECK-NEXT: [[TMP103:%.*]] = extractelement <2 x i32> [[TMP76]], i32 1 +; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[TMP78]], [[TMP103]] +; CHECK-NEXT: [[SUB102_1:%.*]] = sub i32 [[TMP103]], [[TMP78]] +; CHECK-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0 +; CHECK-NEXT: [[SHR_I49_5:%.*]] = lshr i32 [[TMP80]], 15 ; CHECK-NEXT: [[AND_I50_5:%.*]] = and i32 [[SHR_I49_5]], 65537 ; CHECK-NEXT: [[MUL_I51_5:%.*]] = mul i32 [[AND_I50_5]], 65535 -; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]] -; CHECK-NEXT: [[SUB102_2:%.*]] = sub i32 [[SUB51_2]], [[SUB51_3]] +; CHECK-NEXT: [[TMP82:%.*]] = extractelement <2 x i32> [[TMP94]], i32 0 +; CHECK-NEXT: [[TMP83:%.*]] 
= extractelement <2 x i32> [[TMP94]], i32 1 +; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[TMP82]], [[TMP83]] +; CHECK-NEXT: [[SUB102_2:%.*]] = sub i32 [[TMP83]], [[TMP82]] ; CHECK-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV_1]], 15 ; CHECK-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537 ; CHECK-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535 -; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]] -; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]] +; CHECK-NEXT: [[TMP84:%.*]] = extractelement <2 x i32> [[TMP77]], i32 0 +; CHECK-NEXT: [[TMP85:%.*]] = extractelement <2 x i32> [[TMP77]], i32 1 +; CHECK-NEXT: [[ADD94_5:%.*]] = add i32 [[TMP84]], [[TMP85]] +; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP85]], [[TMP84]] ; CHECK-NEXT: [[SHR_I49_6:%.*]] = lshr i32 [[CONV1]], 15 ; CHECK-NEXT: [[AND_I50_6:%.*]] = and i32 [[SHR_I49_6]], 65537 ; CHECK-NEXT: [[MUL_I51_6:%.*]] = mul i32 [[AND_I50_6]], 65535 ; CHECK-NEXT: [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 ; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32> -; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP77:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> -; CHECK-NEXT: [[TMP73:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP78:%.*]] = zext <2 x i8> [[TMP73]] to <2 x i32> -; CHECK-NEXT: [[TMP85:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP85]] to <2 x i32> -; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP78]], [[TMP76]] +; CHECK-NEXT: [[TMP112:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 +; CHECK-NEXT: [[TMP113:%.*]] = shufflevector <4 x i8> [[TMP112]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP113]] to <2 x 
i32> +; CHECK-NEXT: [[TMP90:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP114:%.*]] = shufflevector <4 x i8> [[TMP90]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32> +; CHECK-NEXT: [[TMP116:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP118:%.*]] = shufflevector <4 x i8> [[TMP116]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32> +; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP111]], [[TMP115]] ; CHECK-NEXT: [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], -; CHECK-NEXT: [[TMP89:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP80:%.*]] = zext <2 x i8> [[TMP89]] to <2 x i32> -; CHECK-NEXT: [[TMP81:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP82:%.*]] = zext <2 x i8> [[TMP81]] to <2 x i32> -; CHECK-NEXT: [[TMP83:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP84:%.*]] = zext <2 x i8> [[TMP83]] to <2 x i32> -; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP82]], [[TMP84]] +; CHECK-NEXT: [[TMP98:%.*]] = shufflevector <4 x i8> [[TMP112]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP119:%.*]] = zext <2 x i8> [[TMP98]] to <2 x i32> +; CHECK-NEXT: [[TMP100:%.*]] = shufflevector <4 x i8> [[TMP90]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP101:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32> +; CHECK-NEXT: [[TMP120:%.*]] = shufflevector <4 x i8> [[TMP116]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP155:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32> +; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP101]], [[TMP155]] ; CHECK-NEXT: [[TMP96:%.*]] = shl <2 x i32> [[TMP95]], ; CHECK-NEXT: [[TMP97:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 
1 -; CHECK-NEXT: [[TMP90:%.*]] = sub <2 x i32> [[TMP97]], [[TMP80]] -; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP90]] +; CHECK-NEXT: [[TMP107:%.*]] = sub <2 x i32> [[TMP97]], [[TMP119]] +; CHECK-NEXT: [[TMP105:%.*]] = add <2 x i32> [[TMP96]], [[TMP107]] ; CHECK-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0 -; CHECK-NEXT: [[TMP98:%.*]] = sub <2 x i32> [[TMP86]], [[TMP77]] -; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP98]] +; CHECK-NEXT: [[TMP156:%.*]] = sub <2 x i32> [[TMP86]], [[TMP109]] +; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP156]] ; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> ; CHECK-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP105]], [[TMP92]] ; CHECK-NEXT: [[TMP91:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]] @@ -170,10 +176,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP108:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1 ; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP108]], [[TMP238]] ; CHECK-NEXT: [[SUB51:%.*]] = sub i32 [[TMP238]], [[TMP108]] -; CHECK-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0 +; CHECK-NEXT: [[TMP160:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0 ; CHECK-NEXT: [[SUB47:%.*]] = extractelement <2 x i32> [[TMP91]], i32 1 -; CHECK-NEXT: [[ADD55:%.*]] = add i32 [[SUB47]], [[TMP94]] -; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP94]], [[SUB47]] +; CHECK-NEXT: [[ADD55:%.*]] = add i32 [[SUB47]], [[TMP160]] +; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP160]], [[SUB47]] ; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP108]], 15 ; CHECK-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537 ; CHECK-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535 @@ -182,38 +188,41 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[MUL_I61_4:%.*]] = mul i32 [[AND_I60_4]], 65535 ; CHECK-NEXT: [[TMP104:%.*]] = 
load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 ; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32> -; CHECK-NEXT: [[TMP109:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR644]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32> -; CHECK-NEXT: [[TMP116:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP118:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32> -; CHECK-NEXT: [[TMP128:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP155:%.*]] = zext <2 x i8> [[TMP128]] to <2 x i32> -; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP118]], [[TMP155]] +; CHECK-NEXT: [[TMP157:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 +; CHECK-NEXT: [[TMP122:%.*]] = shufflevector <4 x i8> [[TMP157]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP123:%.*]] = zext <2 x i8> [[TMP122]] to <2 x i32> +; CHECK-NEXT: [[TMP158:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP159:%.*]] = shufflevector <4 x i8> [[TMP158]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP126:%.*]] = zext <2 x i8> [[TMP159]] to <2 x i32> +; CHECK-NEXT: [[TMP161:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP128:%.*]] = shufflevector <4 x i8> [[TMP161]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP162:%.*]] = zext <2 x i8> [[TMP128]] to <2 x i32> +; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP126]], [[TMP162]] ; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], -; CHECK-NEXT: [[TMP156:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP156]] to <2 x i32> -; CHECK-NEXT: [[TMP112:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 
[[ARRAYIDX25_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP113:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32> -; CHECK-NEXT: [[TMP114:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27_1]], i64 2, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32> -; CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP113]], [[TMP115]] +; CHECK-NEXT: [[TMP163:%.*]] = shufflevector <4 x i8> [[TMP157]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP164:%.*]] = zext <2 x i8> [[TMP163]] to <2 x i32> +; CHECK-NEXT: [[TMP165:%.*]] = shufflevector <4 x i8> [[TMP158]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP166:%.*]] = zext <2 x i8> [[TMP165]] to <2 x i32> +; CHECK-NEXT: [[TMP167:%.*]] = shufflevector <4 x i8> [[TMP161]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP169:%.*]] = zext <2 x i8> [[TMP167]] to <2 x i32> +; CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP166]], [[TMP169]] ; CHECK-NEXT: [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], ; CHECK-NEXT: [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1 -; CHECK-NEXT: [[TMP119:%.*]] = sub <2 x i32> [[TMP137]], [[TMP111]] -; CHECK-NEXT: [[TMP120:%.*]] = add <2 x i32> [[TMP136]], [[TMP119]] +; CHECK-NEXT: [[TMP170:%.*]] = sub <2 x i32> [[TMP137]], [[TMP164]] +; CHECK-NEXT: [[TMP142:%.*]] = add <2 x i32> [[TMP136]], [[TMP170]] ; CHECK-NEXT: [[TMP117:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0 -; CHECK-NEXT: [[TMP122:%.*]] = sub <2 x i32> [[TMP117]], [[TMP103]] -; CHECK-NEXT: [[TMP123:%.*]] = add <2 x i32> [[TMP125]], [[TMP122]] -; CHECK-NEXT: [[TMP143:%.*]] = add <2 x i32> [[TMP120]], [[TMP123]] -; CHECK-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP123]], [[TMP120]] -; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0 +; CHECK-NEXT: [[TMP171:%.*]] = sub <2 x i32> [[TMP117]], [[TMP123]] +; CHECK-NEXT: [[TMP145:%.*]] = add <2 x i32> [[TMP125]], [[TMP171]] +; CHECK-NEXT: [[TMP143:%.*]] = add 
<2 x i32> [[TMP142]], [[TMP145]] +; CHECK-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP145]], [[TMP142]] +; CHECK-NEXT: [[TMP172:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0 ; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1 -; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP146]], [[TMP145]] -; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP145]], [[TMP146]] -; CHECK-NEXT: [[TMP126:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0 +; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP146]], [[TMP172]] +; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP172]], [[TMP146]] +; CHECK-NEXT: [[TMP173:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0 ; CHECK-NEXT: [[TMP127:%.*]] = extractelement <2 x i32> [[TMP121]], i32 1 -; CHECK-NEXT: [[ADD55_1:%.*]] = add i32 [[TMP127]], [[TMP126]] -; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP126]], [[TMP127]] +; CHECK-NEXT: [[ADD55_1:%.*]] = add i32 [[TMP127]], [[TMP173]] +; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP173]], [[TMP127]] ; CHECK-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[TMP146]], 15 ; CHECK-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537 ; CHECK-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535 @@ -229,7 +238,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I51_3]], [[ADD103]] ; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP63]] ; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I_1]], [[ADD105]] -; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV]] +; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP59]] ; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56_1]], [[SUB104]] ; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP146]] ; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61_1]], [[SUB106]] @@ -242,7 +251,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]] ; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 
[[SUB86_1]], [[SUB102_1]] ; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_5]], [[ADD105_1]] -; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP107]] +; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP80]] ; CHECK-NEXT: [[TMP129:%.*]] = shufflevector <2 x i32> [[TMP17]], <2 x i32> [[TMP121]], <2 x i32> ; CHECK-NEXT: [[TMP130:%.*]] = lshr <2 x i32> [[TMP129]], ; CHECK-NEXT: [[TMP131:%.*]] = and <2 x i32> [[TMP130]], @@ -259,8 +268,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_4]], [[SUB106_1]] ; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[SUB47]] ; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]] -; CHECK-NEXT: [[TMP142:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0 -; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP142]] +; CHECK-NEXT: [[TMP168:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0 +; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP168]] ; CHECK-NEXT: [[TMP154:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1 ; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP154]] ; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]] @@ -285,8 +294,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]] ; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP238]] ; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]] -; CHECK-NEXT: [[TMP237:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0 -; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP237]] +; CHECK-NEXT: [[TMP179:%.*]] = extractelement <2 x i32> [[TMP213]], i32 0 +; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP179]] ; CHECK-NEXT: [[TMP218:%.*]] = extractelement <2 x i32> [[TMP213]], i32 1 ; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP218]] ; CHECK-NEXT: 
[[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]] @@ -314,8 +323,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]] ; CHECK-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]] ; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]] -; CHECK-NEXT: [[TMP235:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0 -; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP235]] +; CHECK-NEXT: [[TMP193:%.*]] = extractelement <2 x i32> [[TMP234]], i32 0 +; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP193]] ; CHECK-NEXT: [[TMP236:%.*]] = extractelement <2 x i32> [[TMP234]], i32 1 ; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP236]] ; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]] @@ -329,9 +338,6 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4 ; THR15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4 ; THR15-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1 -; THR15-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i8, ptr [[PIX2]], i64 1 -; THR15-NEXT: [[ARRAYIDX25:%.*]] = getelementptr i8, ptr [[PIX1]], i64 5 -; THR15-NEXT: [[ARRAYIDX27:%.*]] = getelementptr i8, ptr [[PIX2]], i64 5 ; THR15-NEXT: [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3 ; THR15-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1 ; THR15-NEXT: [[CONV33:%.*]] = zext i8 [[TMP1]] to i32 @@ -342,9 +348,6 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4 ; THR15-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4 ; THR15-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1 -; THR15-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr 
[[ADD_PTR644]], i64 1 -; THR15-NEXT: [[ARRAYIDX13_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 5 -; THR15-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 5 ; THR15-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3 ; THR15-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1 ; THR15-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP3]] to i32 @@ -352,139 +355,147 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] ; THR15-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; THR15-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 -; THR15-NEXT: [[TMP4:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1 -; THR15-NEXT: [[TMP66:%.*]] = zext <2 x i8> [[TMP4]] to <2 x i32> -; THR15-NEXT: [[TMP6:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1 -; THR15-NEXT: [[TMP7:%.*]] = zext <2 x i8> [[TMP6]] to <2 x i32> -; THR15-NEXT: [[TMP8:%.*]] = sub <2 x i32> [[TMP66]], [[TMP7]] -; THR15-NEXT: [[TMP9:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; THR15-NEXT: [[TMP10:%.*]] = zext <2 x i8> [[TMP9]] to <2 x i32> -; THR15-NEXT: [[TMP11:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1 -; THR15-NEXT: [[TMP12:%.*]] = zext <2 x i8> [[TMP11]] to <2 x i32> -; THR15-NEXT: [[TMP13:%.*]] = sub <2 x i32> [[TMP10]], [[TMP12]] +; THR15-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; THR15-NEXT: [[TMP17:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP6:%.*]] = zext <2 x i8> [[TMP17]] to <2 x i32> +; THR15-NEXT: [[TMP7:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; THR15-NEXT: [[TMP8:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP9:%.*]] = zext <2 x i8> [[TMP8]] to <2 x i32> +; THR15-NEXT: [[TMP10:%.*]] = sub <2 x i32> [[TMP6]], [[TMP9]] +; THR15-NEXT: [[TMP11:%.*]] = load <4 
x i8>, ptr [[ARRAYIDX3_2]], align 1 +; THR15-NEXT: [[TMP12:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP18:%.*]] = zext <2 x i8> [[TMP12]] to <2 x i32> +; THR15-NEXT: [[TMP29:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; THR15-NEXT: [[TMP15:%.*]] = shufflevector <4 x i8> [[TMP29]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP16:%.*]] = zext <2 x i8> [[TMP15]] to <2 x i32> +; THR15-NEXT: [[TMP13:%.*]] = sub <2 x i32> [[TMP18]], [[TMP16]] ; THR15-NEXT: [[TMP14:%.*]] = shl <2 x i32> [[TMP13]], -; THR15-NEXT: [[TMP15:%.*]] = add <2 x i32> [[TMP14]], [[TMP8]] -; THR15-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2 -; THR15-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 2 -; THR15-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 6 -; THR15-NEXT: [[ARRAYIDX27_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 6 -; THR15-NEXT: [[TMP16:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1 -; THR15-NEXT: [[TMP17:%.*]] = zext <2 x i8> [[TMP16]] to <2 x i32> -; THR15-NEXT: [[TMP18:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_2]], align 1 -; THR15-NEXT: [[TMP19:%.*]] = zext <2 x i8> [[TMP18]] to <2 x i32> -; THR15-NEXT: [[TMP20:%.*]] = sub <2 x i32> [[TMP17]], [[TMP19]] -; THR15-NEXT: [[TMP21:%.*]] = load <2 x i8>, ptr [[ARRAYIDX25_2]], align 1 -; THR15-NEXT: [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32> -; THR15-NEXT: [[TMP23:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_2]], align 1 -; THR15-NEXT: [[TMP24:%.*]] = zext <2 x i8> [[TMP23]] to <2 x i32> -; THR15-NEXT: [[TMP25:%.*]] = sub <2 x i32> [[TMP22]], [[TMP24]] +; THR15-NEXT: [[TMP19:%.*]] = add <2 x i32> [[TMP14]], [[TMP10]] +; THR15-NEXT: [[TMP20:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP20]] to <2 x i32> +; THR15-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP7]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP23:%.*]] = 
zext <2 x i8> [[TMP22]] to <2 x i32> +; THR15-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP21]], [[TMP23]] +; THR15-NEXT: [[TMP35:%.*]] = shufflevector <4 x i8> [[TMP11]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP30:%.*]] = zext <2 x i8> [[TMP35]] to <2 x i32> +; THR15-NEXT: [[TMP37:%.*]] = shufflevector <4 x i8> [[TMP29]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP28:%.*]] = zext <2 x i8> [[TMP37]] to <2 x i32> +; THR15-NEXT: [[TMP25:%.*]] = sub <2 x i32> [[TMP30]], [[TMP28]] ; THR15-NEXT: [[TMP26:%.*]] = shl <2 x i32> [[TMP25]], -; THR15-NEXT: [[TMP27:%.*]] = add <2 x i32> [[TMP26]], [[TMP20]] -; THR15-NEXT: [[TMP28:%.*]] = extractelement <2 x i32> [[TMP15]], i32 0 -; THR15-NEXT: [[TMP29:%.*]] = extractelement <2 x i32> [[TMP15]], i32 1 -; THR15-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP29]], [[TMP28]] -; THR15-NEXT: [[SUB45_2:%.*]] = sub i32 [[TMP28]], [[TMP29]] -; THR15-NEXT: [[TMP30:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0 +; THR15-NEXT: [[TMP33:%.*]] = add <2 x i32> [[TMP26]], [[TMP24]] +; THR15-NEXT: [[TMP27:%.*]] = add <2 x i32> [[TMP33]], [[TMP19]] +; THR15-NEXT: [[TMP47:%.*]] = sub <2 x i32> [[TMP19]], [[TMP33]] +; THR15-NEXT: [[TMP36:%.*]] = extractelement <2 x i32> [[TMP27]], i32 0 ; THR15-NEXT: [[TMP31:%.*]] = extractelement <2 x i32> [[TMP27]], i32 1 -; THR15-NEXT: [[ADD46_2:%.*]] = add i32 [[TMP31]], [[TMP30]] -; THR15-NEXT: [[SUB47_2:%.*]] = sub i32 [[TMP30]], [[TMP31]] -; THR15-NEXT: [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_2]] -; THR15-NEXT: [[SUB51_2:%.*]] = sub i32 [[ADD44_2]], [[ADD46_2]] -; THR15-NEXT: [[ADD55_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]] -; THR15-NEXT: [[SUB59_2:%.*]] = sub i32 [[SUB45_2]], [[SUB47_2]] -; THR15-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4 -; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 +; THR15-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP31]], [[TMP36]] +; THR15-NEXT: [[ARRAYIDX5_4:%.*]] = getelementptr i8, ptr null, i64 4 ; THR15-NEXT: [[TMP32:%.*]] = load <2 x 
i8>, ptr null, align 1 -; THR15-NEXT: [[TMP33:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32> +; THR15-NEXT: [[TMP38:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32> ; THR15-NEXT: [[TMP34:%.*]] = load <2 x i8>, ptr null, align 1 -; THR15-NEXT: [[TMP35:%.*]] = zext <2 x i8> [[TMP34]] to <2 x i32> -; THR15-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP33]], [[TMP35]] -; THR15-NEXT: [[TMP37:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP38:%.*]] = zext <2 x i8> [[TMP37]] to <2 x i32> -; THR15-NEXT: [[TMP39:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; THR15-NEXT: [[TMP40:%.*]] = zext <2 x i8> [[TMP39]] to <2 x i32> -; THR15-NEXT: [[TMP41:%.*]] = sub <2 x i32> [[TMP38]], [[TMP40]] +; THR15-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP34]] to <2 x i32> +; THR15-NEXT: [[TMP40:%.*]] = sub <2 x i32> [[TMP38]], [[TMP39]] +; THR15-NEXT: [[TMP49:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> , i32 2) +; THR15-NEXT: [[TMP64:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32> +; THR15-NEXT: [[TMP66:%.*]] = shufflevector <2 x i32> [[TMP64]], <2 x i32> poison, <2 x i32> +; THR15-NEXT: [[TMP55:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_4]], align 1 +; THR15-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP55]] to <2 x i32> +; THR15-NEXT: [[TMP41:%.*]] = sub <2 x i32> [[TMP66]], [[TMP50]] ; THR15-NEXT: [[TMP42:%.*]] = shl <2 x i32> [[TMP41]], -; THR15-NEXT: [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP36]] +; THR15-NEXT: [[TMP43:%.*]] = add <2 x i32> [[TMP42]], [[TMP40]] ; THR15-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2 ; THR15-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2 ; THR15-NEXT: [[TMP44:%.*]] = load i8, ptr null, align 1 ; THR15-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6 ; THR15-NEXT: [[TMP45:%.*]] = load i8, ptr null, align 1 ; THR15-NEXT: [[TMP46:%.*]] = load <2 x i8>, ptr 
[[ARRAYIDX20_3]], align 1 -; THR15-NEXT: [[TMP47:%.*]] = zext <2 x i8> [[TMP46]] to <2 x i32> +; THR15-NEXT: [[TMP53:%.*]] = zext <2 x i8> [[TMP46]] to <2 x i32> ; THR15-NEXT: [[TMP48:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1 -; THR15-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP48]] to <2 x i32> -; THR15-NEXT: [[TMP50:%.*]] = sub <2 x i32> [[TMP47]], [[TMP49]] +; THR15-NEXT: [[TMP59:%.*]] = zext <2 x i8> [[TMP48]] to <2 x i32> +; THR15-NEXT: [[TMP61:%.*]] = sub <2 x i32> [[TMP53]], [[TMP59]] ; THR15-NEXT: [[TMP51:%.*]] = insertelement <2 x i8> poison, i8 [[TMP44]], i32 0 ; THR15-NEXT: [[TMP52:%.*]] = insertelement <2 x i8> [[TMP51]], i8 [[TMP45]], i32 1 -; THR15-NEXT: [[TMP53:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32> +; THR15-NEXT: [[TMP67:%.*]] = zext <2 x i8> [[TMP52]] to <2 x i32> ; THR15-NEXT: [[TMP54:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1 -; THR15-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32> -; THR15-NEXT: [[TMP56:%.*]] = sub <2 x i32> [[TMP53]], [[TMP55]] +; THR15-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32> +; THR15-NEXT: [[TMP56:%.*]] = sub <2 x i32> [[TMP67]], [[TMP94]] ; THR15-NEXT: [[TMP57:%.*]] = shl <2 x i32> [[TMP56]], -; THR15-NEXT: [[TMP58:%.*]] = add <2 x i32> [[TMP57]], [[TMP50]] -; THR15-NEXT: [[TMP59:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0 +; THR15-NEXT: [[TMP58:%.*]] = add <2 x i32> [[TMP57]], [[TMP61]] +; THR15-NEXT: [[TMP108:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0 ; THR15-NEXT: [[TMP60:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1 -; THR15-NEXT: [[ADD44_3:%.*]] = add i32 [[TMP60]], [[TMP59]] -; THR15-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP59]], [[TMP60]] -; THR15-NEXT: [[TMP61:%.*]] = extractelement <2 x i32> [[TMP58]], i32 0 +; THR15-NEXT: [[ADD44_3:%.*]] = add i32 [[TMP60]], [[TMP108]] +; THR15-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP108]], [[TMP60]] +; THR15-NEXT: [[TMP109:%.*]] = extractelement <2 x i32> [[TMP58]], i32 0 ; THR15-NEXT: [[TMP62:%.*]] = extractelement <2 
x i32> [[TMP58]], i32 1 -; THR15-NEXT: [[ADD46_3:%.*]] = add i32 [[TMP62]], [[TMP61]] -; THR15-NEXT: [[SUB47_3:%.*]] = sub i32 [[TMP61]], [[TMP62]] +; THR15-NEXT: [[ADD46_3:%.*]] = add i32 [[TMP62]], [[TMP109]] +; THR15-NEXT: [[SUB47_3:%.*]] = sub i32 [[TMP109]], [[TMP62]] ; THR15-NEXT: [[ADD48_3:%.*]] = add i32 [[ADD46_3]], [[ADD44_3]] -; THR15-NEXT: [[SUB51_3:%.*]] = sub i32 [[ADD44_3]], [[ADD46_3]] -; THR15-NEXT: [[ADD55_3:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]] -; THR15-NEXT: [[SUB59_3:%.*]] = sub i32 [[SUB45_3]], [[SUB47_3]] +; THR15-NEXT: [[TMP68:%.*]] = shufflevector <2 x i32> [[TMP27]], <2 x i32> poison, <2 x i32> +; THR15-NEXT: [[TMP69:%.*]] = insertelement <2 x i32> [[TMP68]], i32 [[ADD44_3]], i32 0 +; THR15-NEXT: [[TMP70:%.*]] = insertelement <2 x i32> [[TMP27]], i32 [[ADD46_3]], i32 0 +; THR15-NEXT: [[TMP71:%.*]] = sub <2 x i32> [[TMP69]], [[TMP70]] +; THR15-NEXT: [[TMP104:%.*]] = insertelement <2 x i32> [[TMP47]], i32 [[SUB47_3]], i32 0 +; THR15-NEXT: [[TMP105:%.*]] = shufflevector <2 x i32> [[TMP47]], <2 x i32> poison, <2 x i32> +; THR15-NEXT: [[TMP116:%.*]] = insertelement <2 x i32> [[TMP105]], i32 [[SUB45_3]], i32 0 +; THR15-NEXT: [[TMP75:%.*]] = add <2 x i32> [[TMP104]], [[TMP116]] +; THR15-NEXT: [[TMP76:%.*]] = sub <2 x i32> [[TMP116]], [[TMP104]] ; THR15-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]] ; THR15-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]] -; THR15-NEXT: [[TMP63:%.*]] = extractelement <2 x i32> [[TMP33]], i32 0 +; THR15-NEXT: [[TMP63:%.*]] = extractelement <2 x i32> [[TMP38]], i32 0 ; THR15-NEXT: [[SHR_I:%.*]] = lshr i32 [[TMP63]], 15 ; THR15-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 ; THR15-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535 -; THR15-NEXT: [[SHR_I49:%.*]] = lshr i32 [[ADD46_2]], 15 +; THR15-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP31]], 15 ; THR15-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537 ; THR15-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535 -; THR15-NEXT: [[ADD55_1:%.*]] = add i32 
[[ADD55_3]], [[ADD55_2]] -; THR15-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]] -; THR15-NEXT: [[TMP64:%.*]] = extractelement <2 x i32> [[TMP66]], i32 0 -; THR15-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[TMP64]], 15 +; THR15-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP75]], i32 0 +; THR15-NEXT: [[TMP79:%.*]] = extractelement <2 x i32> [[TMP75]], i32 1 +; THR15-NEXT: [[ADD55_1:%.*]] = add i32 [[TMP78]], [[TMP79]] +; THR15-NEXT: [[SUB102_1:%.*]] = sub i32 [[TMP79]], [[TMP78]] +; THR15-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP6]], i32 0 +; THR15-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[TMP80]], 15 ; THR15-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537 ; THR15-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535 -; THR15-NEXT: [[ADD94_2:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]] -; THR15-NEXT: [[SUB102_2:%.*]] = sub i32 [[SUB51_2]], [[SUB51_3]] +; THR15-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP71]], i32 0 +; THR15-NEXT: [[TMP82:%.*]] = extractelement <2 x i32> [[TMP71]], i32 1 +; THR15-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP81]], [[TMP82]] +; THR15-NEXT: [[SUB102_2:%.*]] = sub i32 [[TMP82]], [[TMP81]] ; THR15-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV_1]], 15 ; THR15-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537 ; THR15-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535 -; THR15-NEXT: [[ADD94_3:%.*]] = add i32 [[SUB59_3]], [[SUB59_2]] -; THR15-NEXT: [[SUB102_3:%.*]] = sub i32 [[SUB59_2]], [[SUB59_3]] +; THR15-NEXT: [[TMP83:%.*]] = extractelement <2 x i32> [[TMP76]], i32 0 +; THR15-NEXT: [[TMP112:%.*]] = extractelement <2 x i32> [[TMP76]], i32 1 +; THR15-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP83]], [[TMP112]] +; THR15-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP112]], [[TMP83]] ; THR15-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV]], 15 ; THR15-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537 ; THR15-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535 ; THR15-NEXT: [[TMP65:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 
; THR15-NEXT: [[TMP74:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32> -; THR15-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> -; THR15-NEXT: [[TMP69:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP70:%.*]] = zext <2 x i8> [[TMP69]] to <2 x i32> -; THR15-NEXT: [[TMP71:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP81:%.*]] = zext <2 x i8> [[TMP71]] to <2 x i32> -; THR15-NEXT: [[TMP72:%.*]] = sub <2 x i32> [[TMP70]], [[TMP81]] +; THR15-NEXT: [[TMP87:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 +; THR15-NEXT: [[TMP117:%.*]] = shufflevector <4 x i8> [[TMP87]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP117]] to <2 x i32> +; THR15-NEXT: [[TMP130:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; THR15-NEXT: [[TMP131:%.*]] = shufflevector <4 x i8> [[TMP130]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP118:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32> +; THR15-NEXT: [[TMP93:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; THR15-NEXT: [[TMP146:%.*]] = shufflevector <4 x i8> [[TMP93]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP120:%.*]] = zext <2 x i8> [[TMP146]] to <2 x i32> +; THR15-NEXT: [[TMP72:%.*]] = sub <2 x i32> [[TMP118]], [[TMP120]] ; THR15-NEXT: [[TMP73:%.*]] = shl <2 x i32> [[TMP72]], -; THR15-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32> -; THR15-NEXT: [[TMP82:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP78:%.*]] = zext <2 x i8> 
[[TMP82]] to <2 x i32> -; THR15-NEXT: [[TMP79:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP80:%.*]] = zext <2 x i8> [[TMP79]] to <2 x i32> -; THR15-NEXT: [[TMP84:%.*]] = sub <2 x i32> [[TMP78]], [[TMP80]] +; THR15-NEXT: [[TMP98:%.*]] = shufflevector <4 x i8> [[TMP87]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP99:%.*]] = zext <2 x i8> [[TMP98]] to <2 x i32> +; THR15-NEXT: [[TMP100:%.*]] = shufflevector <4 x i8> [[TMP130]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP143:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32> +; THR15-NEXT: [[TMP178:%.*]] = shufflevector <4 x i8> [[TMP93]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP147:%.*]] = zext <2 x i8> [[TMP178]] to <2 x i32> +; THR15-NEXT: [[TMP84:%.*]] = sub <2 x i32> [[TMP143]], [[TMP147]] ; THR15-NEXT: [[TMP85:%.*]] = shl <2 x i32> [[TMP84]], ; THR15-NEXT: [[TMP86:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV33]], i32 1 -; THR15-NEXT: [[TMP93:%.*]] = sub <2 x i32> [[TMP86]], [[TMP76]] -; THR15-NEXT: [[TMP88:%.*]] = add <2 x i32> [[TMP85]], [[TMP93]] +; THR15-NEXT: [[TMP107:%.*]] = sub <2 x i32> [[TMP86]], [[TMP99]] +; THR15-NEXT: [[TMP88:%.*]] = add <2 x i32> [[TMP85]], [[TMP107]] ; THR15-NEXT: [[TMP92:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[CONV]], i32 0 -; THR15-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP92]], [[TMP68]] -; THR15-NEXT: [[TMP95:%.*]] = add <2 x i32> [[TMP73]], [[TMP87]] +; THR15-NEXT: [[TMP110:%.*]] = sub <2 x i32> [[TMP92]], [[TMP111]] +; THR15-NEXT: [[TMP95:%.*]] = add <2 x i32> [[TMP73]], [[TMP110]] ; THR15-NEXT: [[TMP97:%.*]] = shufflevector <2 x i32> [[TMP88]], <2 x i32> [[TMP95]], <2 x i32> ; THR15-NEXT: [[TMP77:%.*]] = add <2 x i32> [[TMP88]], [[TMP95]] ; THR15-NEXT: [[TMP91:%.*]] = sub <2 x i32> [[TMP95]], [[TMP88]] @@ -492,10 +503,10 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[TMP90:%.*]] = extractelement <2 x 
i32> [[TMP77]], i32 1 ; THR15-NEXT: [[ADD48:%.*]] = add i32 [[TMP90]], [[TMP89]] ; THR15-NEXT: [[SUB51:%.*]] = sub i32 [[TMP89]], [[TMP90]] -; THR15-NEXT: [[TMP94:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0 +; THR15-NEXT: [[TMP161:%.*]] = extractelement <2 x i32> [[TMP91]], i32 0 ; THR15-NEXT: [[SUB47:%.*]] = extractelement <2 x i32> [[TMP91]], i32 1 -; THR15-NEXT: [[ADD56:%.*]] = add i32 [[SUB47]], [[TMP94]] -; THR15-NEXT: [[SUB59:%.*]] = sub i32 [[TMP94]], [[SUB47]] +; THR15-NEXT: [[ADD56:%.*]] = add i32 [[SUB47]], [[TMP161]] +; THR15-NEXT: [[SUB59:%.*]] = sub i32 [[TMP161]], [[SUB47]] ; THR15-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP90]], 15 ; THR15-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537 ; THR15-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535 @@ -504,38 +515,41 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535 ; THR15-NEXT: [[TMP96:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 ; THR15-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32> -; THR15-NEXT: [[TMP98:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR644]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP99:%.*]] = zext <2 x i8> [[TMP98]] to <2 x i32> -; THR15-NEXT: [[TMP100:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_1]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP104:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32> -; THR15-NEXT: [[TMP105:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_1]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP112:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32> -; THR15-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP104]], [[TMP112]] +; THR15-NEXT: [[TMP180:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 +; THR15-NEXT: [[TMP181:%.*]] = shufflevector <4 x i8> [[TMP180]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP175:%.*]] = 
zext <2 x i8> [[TMP181]] to <2 x i32> +; THR15-NEXT: [[TMP183:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; THR15-NEXT: [[TMP184:%.*]] = shufflevector <4 x i8> [[TMP183]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP177:%.*]] = zext <2 x i8> [[TMP184]] to <2 x i32> +; THR15-NEXT: [[TMP127:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; THR15-NEXT: [[TMP128:%.*]] = shufflevector <4 x i8> [[TMP127]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP129:%.*]] = zext <2 x i8> [[TMP128]] to <2 x i32> +; THR15-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP177]], [[TMP129]] ; THR15-NEXT: [[TMP102:%.*]] = shl <2 x i32> [[TMP101]], -; THR15-NEXT: [[TMP120:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22_1]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP107:%.*]] = zext <2 x i8> [[TMP120]] to <2 x i32> -; THR15-NEXT: [[TMP108:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_1]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP109:%.*]] = zext <2 x i8> [[TMP108]] to <2 x i32> -; THR15-NEXT: [[TMP110:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27_1]], i64 2, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP111:%.*]] = zext <2 x i8> [[TMP110]] to <2 x i32> -; THR15-NEXT: [[TMP113:%.*]] = sub <2 x i32> [[TMP109]], [[TMP111]] +; THR15-NEXT: [[TMP187:%.*]] = shufflevector <4 x i8> [[TMP180]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP182:%.*]] = zext <2 x i8> [[TMP187]] to <2 x i32> +; THR15-NEXT: [[TMP189:%.*]] = shufflevector <4 x i8> [[TMP183]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP185:%.*]] = zext <2 x i8> [[TMP189]] to <2 x i32> +; THR15-NEXT: [[TMP191:%.*]] = shufflevector <4 x i8> [[TMP127]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP186:%.*]] = zext <2 x i8> [[TMP191]] to <2 x i32> +; THR15-NEXT: [[TMP113:%.*]] = sub <2 x i32> [[TMP185]], [[TMP186]] ; THR15-NEXT: [[TMP114:%.*]] = shl <2 x i32> [[TMP113]], ; THR15-NEXT: 
[[TMP115:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[CONV33_1]], i32 1 -; THR15-NEXT: [[TMP117:%.*]] = sub <2 x i32> [[TMP115]], [[TMP107]] -; THR15-NEXT: [[TMP116:%.*]] = add <2 x i32> [[TMP114]], [[TMP117]] +; THR15-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP115]], [[TMP182]] +; THR15-NEXT: [[TMP142:%.*]] = add <2 x i32> [[TMP114]], [[TMP141]] ; THR15-NEXT: [[TMP126:%.*]] = insertelement <2 x i32> [[TMP103]], i32 [[CONV_1]], i32 0 -; THR15-NEXT: [[TMP127:%.*]] = sub <2 x i32> [[TMP126]], [[TMP99]] -; THR15-NEXT: [[TMP128:%.*]] = add <2 x i32> [[TMP102]], [[TMP127]] -; THR15-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP116]], [[TMP128]] -; THR15-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP128]], [[TMP116]] -; THR15-NEXT: [[TMP118:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0 +; THR15-NEXT: [[TMP188:%.*]] = sub <2 x i32> [[TMP126]], [[TMP175]] +; THR15-NEXT: [[TMP145:%.*]] = add <2 x i32> [[TMP102]], [[TMP188]] +; THR15-NEXT: [[TMP106:%.*]] = add <2 x i32> [[TMP142]], [[TMP145]] +; THR15-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP145]], [[TMP142]] +; THR15-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP106]], i32 0 ; THR15-NEXT: [[TMP119:%.*]] = extractelement <2 x i32> [[TMP106]], i32 1 -; THR15-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP119]], [[TMP118]] -; THR15-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP118]], [[TMP119]] -; THR15-NEXT: [[TMP129:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0 +; THR15-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP119]], [[TMP190]] +; THR15-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP190]], [[TMP119]] +; THR15-NEXT: [[TMP150:%.*]] = extractelement <2 x i32> [[TMP121]], i32 0 ; THR15-NEXT: [[TMP125:%.*]] = extractelement <2 x i32> [[TMP121]], i32 1 -; THR15-NEXT: [[ADD55_4:%.*]] = add i32 [[TMP125]], [[TMP129]] -; THR15-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP129]], [[TMP125]] +; THR15-NEXT: [[ADD55_2:%.*]] = add i32 [[TMP125]], [[TMP150]] +; THR15-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP150]], [[TMP125]] ; THR15-NEXT: [[SHR_I54_1:%.*]] = lshr i32 
[[TMP119]], 15 ; THR15-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537 ; THR15-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535 @@ -551,7 +565,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]] ; THR15-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP63]] ; THR15-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]] -; THR15-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[ADD46_2]] +; THR15-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP31]] ; THR15-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56_1]], [[SUB104]] ; THR15-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP119]] ; THR15-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]] @@ -559,13 +573,13 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]] ; THR15-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]] ; THR15-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]] -; THR15-NEXT: [[ADD55:%.*]] = add i32 [[ADD55_4]], [[ADD56]] -; THR15-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD56]], [[ADD55_4]] +; THR15-NEXT: [[ADD55:%.*]] = add i32 [[ADD55_2]], [[ADD56]] +; THR15-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD56]], [[ADD55_2]] ; THR15-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]] ; THR15-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]] ; THR15-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_1]] -; THR15-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP64]] -; THR15-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP66]], <2 x i32> [[TMP121]], <2 x i32> +; THR15-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP80]] +; THR15-NEXT: [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP21]], <2 x i32> [[TMP121]], <2 x i32> ; THR15-NEXT: [[TMP132:%.*]] = lshr <2 x i32> [[TMP5]], ; THR15-NEXT: [[TMP133:%.*]] = and <2 x i32> [[TMP132]], ; THR15-NEXT: [[TMP134:%.*]] = mul <2 x i32> [[TMP133]], @@ 
-581,8 +595,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]] ; THR15-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[SUB47]] ; THR15-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]] -; THR15-NEXT: [[TMP150:%.*]] = extractelement <2 x i32> [[TMP149]], i32 0 -; THR15-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP150]] +; THR15-NEXT: [[TMP192:%.*]] = extractelement <2 x i32> [[TMP149]], i32 0 +; THR15-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[TMP192]] ; THR15-NEXT: [[TMP151:%.*]] = extractelement <2 x i32> [[TMP149]], i32 1 ; THR15-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[TMP151]] ; THR15-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]] @@ -607,8 +621,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]] ; THR15-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP89]] ; THR15-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]] -; THR15-NEXT: [[TMP161:%.*]] = extractelement <2 x i32> [[TMP160]], i32 0 -; THR15-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP161]] +; THR15-NEXT: [[TMP179:%.*]] = extractelement <2 x i32> [[TMP160]], i32 0 +; THR15-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP179]] ; THR15-NEXT: [[TMP162:%.*]] = extractelement <2 x i32> [[TMP160]], i32 1 ; THR15-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP162]] ; THR15-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]] @@ -636,8 +650,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]] ; THR15-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]] ; THR15-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]] -; THR15-NEXT: [[TMP175:%.*]] = extractelement <2 x i32> [[TMP174]], i32 0 -; THR15-NEXT: 
[[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP175]] +; THR15-NEXT: [[TMP193:%.*]] = extractelement <2 x i32> [[TMP174]], i32 0 +; THR15-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP193]] ; THR15-NEXT: [[TMP176:%.*]] = extractelement <2 x i32> [[TMP174]], i32 1 ; THR15-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP176]] ; THR15-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll index 3fa42047162e4..9c1da08c64b7b 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reversed-strided-node-with-external-ptr.ll @@ -10,10 +10,10 @@ define void @test(ptr %a, i64 %0) { ; CHECK-NEXT: br label %[[BB:.*]] ; CHECK: [[BB]]: ; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[TMP0]], 1 -; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> poison, i64 [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> [[TMP4]], i64 0, i32 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, <2 x ptr> [[TMP2]], <2 x i64> [[TMP5]] +; CHECK-NEXT: [[ARRAYIDX17_I28_1:%.*]] = getelementptr double, ptr [[A]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP7:%.*]] = call <2 x double> @llvm.masked.gather.v2f64.v2p0(<2 x ptr> [[TMP6]], i32 8, <2 x i1> , <2 x double> poison) ; CHECK-NEXT: [[TMP8:%.*]] = load <2 x double>, ptr [[A]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = load <2 x double>, ptr [[A]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll index 2daa3b58e5c3a..98333c7b420cf 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/RISCV/scatter-vectorize-reversed.ll @@ -5,12 +5,12 @@ define <4 x i32> @test(<2 x i64> %v, ptr %p) { ; CHECK-LABEL: define <4 x i32> @test( ; CHECK-SAME: <2 x i64> [[V:%.*]], ptr [[P:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i64> [[V]], <2 x i64> poison, <2 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x ptr> poison, ptr [[P]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x ptr> [[TMP0]], <2 x ptr> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[V]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i16, <2 x ptr> [[TMP1]], <2 x i64> [[TMP4]] ; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.masked.gather.v2i16.v2p0(<2 x ptr> [[TMP2]], i32 2, <2 x i1> , <2 x i16> poison) -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = zext <2 x i16> [[TMP3]] to <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP7]], <2 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> zeroinitializer, <4 x i32> [[TMP6]], <4 x i32> ; CHECK-NEXT: ret <4 x i32> [[TMP5]] diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll index 54eb564768318..6876ca7fc351e 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll @@ -6,10 +6,11 @@ define void @test() { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i64(ptr align 8 @src, i64 16, <4 x i1> , i32 4) -; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i64(ptr align 8 getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), i64 16, <4 x i1> 
, i32 4) -; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x double> [[TMP1]], [[TMP2]] -; CHECK-NEXT: store <4 x double> [[TMP3]], ptr @dst, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x double>, ptr @src, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x double> [[TMP1]], <8 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = fsub fast <4 x double> [[TMP2]], [[TMP3]] +; CHECK-NEXT: store <4 x double> [[TMP4]], ptr @dst, align 8 ; CHECK-NEXT: ret void ; %a0 = load double, ptr @src, align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll b/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll index af354bb06ad46..4de16a5d57793 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/crash_dequeue.ll @@ -10,10 +10,8 @@ define void @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_(ptr %__first, ptr n ; CHECK-LABEL: @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[__FIRST:%.*]], align 8 -; CHECK-NEXT: [[_M_FIRST3_I_I:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", ptr [[__FIRST]], i64 0, i32 1 -; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[__LAST:%.*]], align 8 -; CHECK-NEXT: [[_M_FIRST3_I_I83:%.*]] = getelementptr inbounds %"struct.std::_Deque_iterator.4.157.174.208.259.276.344.731", ptr [[__LAST]], i64 0, i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[_M_FIRST3_I_I83]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x ptr>, ptr [[__LAST:%.*]], align 8 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x ptr> [[TMP1]], ptr [[TMP0]], i32 0 ; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT:%.*]], label [[WHILE_COND_I_PREHEADER:%.*]] ; CHECK: while.cond.i.preheader: ; CHECK-NEXT: br label [[WHILE_COND_I:%.*]] @@ -22,10 +20,8 @@ define 
void @_ZSt6uniqueISt15_Deque_iteratorIdRdPdEET_S4_S4_(ptr %__first, ptr n ; CHECK: while.body.i: ; CHECK-NEXT: br i1 undef, label [[_ZST13ADJACENT_FINDIST15_DEQUE_ITERATORIDRDPDEET_S4_S4__EXIT]], label [[WHILE_COND_I]] ; CHECK: _ZSt13adjacent_findISt15_Deque_iteratorIdRdPdEET_S4_S4_.exit: -; CHECK-NEXT: [[TMP3:%.*]] = phi ptr [ [[TMP2]], [[ENTRY:%.*]] ], [ [[TMP2]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] -; CHECK-NEXT: [[TMP4:%.*]] = phi ptr [ [[TMP0]], [[ENTRY]] ], [ [[TMP1]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] -; CHECK-NEXT: store ptr [[TMP4]], ptr [[__FIRST]], align 8 -; CHECK-NEXT: store ptr [[TMP3]], ptr [[_M_FIRST3_I_I]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x ptr> [ [[TMP2]], [[ENTRY:%.*]] ], [ [[TMP1]], [[WHILE_COND_I]] ], [ undef, [[WHILE_BODY_I]] ] +; CHECK-NEXT: store <2 x ptr> [[TMP3]], ptr [[__FIRST]], align 8 ; CHECK-NEXT: br i1 undef, label [[IF_THEN_I55:%.*]], label [[WHILE_COND:%.*]] ; CHECK: if.then.i55: ; CHECK-NEXT: br label [[WHILE_COND]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll index b0d9fea43a0e6..d1f93eccc2a91 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-minmax.ll @@ -837,21 +837,18 @@ define i32 @maxi8_mutiple_uses(i32) { ; THRESH-NEXT: [[TMP5:%.*]] = icmp sgt i32 [[TMP3]], [[TMP4]] ; THRESH-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i32 [[TMP3]], i32 [[TMP4]] ; THRESH-NEXT: [[TMP7:%.*]] = load <4 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 2), align 8 -; THRESH-NEXT: [[TMP8:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 -; THRESH-NEXT: [[TMP9:%.*]] = load i32, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 7), align 4 -; THRESH-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP7]]) -; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 
x i32> poison, i32 [[TMP10]], i32 0 -; THRESH-NEXT: [[TMP12:%.*]] = insertelement <2 x i32> [[TMP11]], i32 [[TMP9]], i32 1 -; THRESH-NEXT: [[TMP13:%.*]] = insertelement <2 x i32> poison, i32 [[TMP8]], i32 0 -; THRESH-NEXT: [[TMP14:%.*]] = insertelement <2 x i32> [[TMP13]], i32 [[TMP6]], i32 1 -; THRESH-NEXT: [[TMP15:%.*]] = icmp sgt <2 x i32> [[TMP12]], [[TMP14]] -; THRESH-NEXT: [[TMP16:%.*]] = select <2 x i1> [[TMP15]], <2 x i32> [[TMP12]], <2 x i32> [[TMP14]] -; THRESH-NEXT: [[TMP17:%.*]] = extractelement <2 x i32> [[TMP16]], i32 0 -; THRESH-NEXT: [[TMP18:%.*]] = extractelement <2 x i32> [[TMP16]], i32 1 -; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP17]], [[TMP18]] -; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP17]], i32 [[TMP18]] -; THRESH-NEXT: [[TMP19:%.*]] = select i1 [[TMP5]], i32 3, i32 4 -; THRESH-NEXT: store i32 [[TMP19]], ptr @var, align 8 +; THRESH-NEXT: [[TMP8:%.*]] = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> [[TMP7]]) +; THRESH-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr getelementptr inbounds ([32 x i32], ptr @arr, i64 0, i64 6), align 8 +; THRESH-NEXT: [[TMP10:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP8]], i32 0 +; THRESH-NEXT: [[TMP11:%.*]] = insertelement <2 x i32> [[TMP9]], i32 [[TMP6]], i32 1 +; THRESH-NEXT: [[TMP12:%.*]] = icmp sgt <2 x i32> [[TMP10]], [[TMP11]] +; THRESH-NEXT: [[TMP13:%.*]] = select <2 x i1> [[TMP12]], <2 x i32> [[TMP10]], <2 x i32> [[TMP11]] +; THRESH-NEXT: [[TMP14:%.*]] = extractelement <2 x i32> [[TMP13]], i32 0 +; THRESH-NEXT: [[TMP15:%.*]] = extractelement <2 x i32> [[TMP13]], i32 1 +; THRESH-NEXT: [[OP_RDX4:%.*]] = icmp sgt i32 [[TMP14]], [[TMP15]] +; THRESH-NEXT: [[OP_RDX5:%.*]] = select i1 [[OP_RDX4]], i32 [[TMP14]], i32 [[TMP15]] +; THRESH-NEXT: [[TMP16:%.*]] = select i1 [[TMP5]], i32 3, i32 4 +; THRESH-NEXT: store i32 [[TMP16]], ptr @var, align 8 ; THRESH-NEXT: ret i32 [[OP_RDX5]] ; %2 = load i32, ptr @arr, align 16 diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll index 9a41c1dc5de22..4f94784a24dd4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge-inseltpoison.ll @@ -100,21 +100,14 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16) define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) { ; CHECK-LABEL: @PR16739_byval( -; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X:%.*]], align 16 -; CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds <4 x float>, ptr [[X]], i64 0, i64 2 -; CHECK-NEXT: [[T4:%.*]] = load i64, ptr [[T2]], align 8 -; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T1]] to i32 -; CHECK-NEXT: [[T6:%.*]] = bitcast i32 [[T5]] to float -; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> poison, float [[T6]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16 +; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 -; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32 -; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float -; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1 -; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32 -; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float -; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2 -; CHECK-NEXT: [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3 -; CHECK-NEXT: ret <4 x float> [[T15]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float> +; CHECK-NEXT: ret <4 x float> [[TMP5]] ; %t1 = load i64, 
ptr %x, align 16 %t2 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll index bc8e6626e5508..700e3ed9effc4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/load-merge.ll @@ -100,21 +100,14 @@ define <4 x float> @PR16739_byref_alt(ptr nocapture readonly dereferenceable(16) define <4 x float> @PR16739_byval(ptr nocapture readonly dereferenceable(16) %x) { ; CHECK-LABEL: @PR16739_byval( -; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X:%.*]], align 16 -; CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds <4 x float>, ptr [[X]], i64 0, i64 2 -; CHECK-NEXT: [[T4:%.*]] = load i64, ptr [[T2]], align 8 -; CHECK-NEXT: [[T5:%.*]] = trunc i64 [[T1]] to i32 -; CHECK-NEXT: [[T6:%.*]] = bitcast i32 [[T5]] to float -; CHECK-NEXT: [[T7:%.*]] = insertelement <4 x float> undef, float [[T6]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr [[X:%.*]], align 16 +; CHECK-NEXT: [[T1:%.*]] = load i64, ptr [[X]], align 16 ; CHECK-NEXT: [[T8:%.*]] = lshr i64 [[T1]], 32 -; CHECK-NEXT: [[T9:%.*]] = trunc i64 [[T8]] to i32 -; CHECK-NEXT: [[T10:%.*]] = bitcast i32 [[T9]] to float -; CHECK-NEXT: [[T11:%.*]] = insertelement <4 x float> [[T7]], float [[T10]], i32 1 -; CHECK-NEXT: [[T12:%.*]] = trunc i64 [[T4]] to i32 -; CHECK-NEXT: [[T13:%.*]] = bitcast i32 [[T12]] to float -; CHECK-NEXT: [[T14:%.*]] = insertelement <4 x float> [[T11]], float [[T13]], i32 2 -; CHECK-NEXT: [[T15:%.*]] = insertelement <4 x float> [[T14]], float [[T13]], i32 3 -; CHECK-NEXT: ret <4 x float> [[T15]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[T8]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = trunc <4 x i64> [[TMP3]] to <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i32> [[TMP4]] to <4 x float> +; CHECK-NEXT: ret <4 
x float> [[TMP5]] ; %t1 = load i64, ptr %x, align 16 %t2 = getelementptr inbounds <4 x float>, ptr %x, i64 0, i64 2 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll index 5a28581913b8c..c3122d991da20 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/lookahead.ll @@ -204,25 +204,21 @@ define void @lookahead_external_uses(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S, ptr ; CHECK-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[A:%.*]], i64 1 ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 2 -; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1 -; CHECK-NEXT: [[B0:%.*]] = load double, ptr [[B]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, ptr [[D:%.*]], align 8 ; CHECK-NEXT: [[B2:%.*]] = load double, ptr [[IDXB2]], align 8 ; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8 -; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 ; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[B1]], i32 1 -; CHECK-NEXT: 
[[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]] -; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[S:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP1]], double [[D0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP7]] +; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[S:%.*]], align 8 ; CHECK-NEXT: store double [[A1]], ptr [[EXT1:%.*]], align 8 ; CHECK-NEXT: ret void ; @@ -284,24 +280,22 @@ define void @lookahead_limit_users_budget(ptr %A, ptr %B, ptr %C, ptr %D, ptr %S ; CHECK-NEXT: [[IDXB2:%.*]] = getelementptr inbounds double, ptr [[B:%.*]], i64 2 ; CHECK-NEXT: [[IDXA2:%.*]] = getelementptr inbounds double, ptr [[A]], i64 2 ; CHECK-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1 -; CHECK-NEXT: [[B0:%.*]] = load double, ptr [[B]], align 8 ; CHECK-NEXT: [[C0:%.*]] = load double, ptr [[C:%.*]], align 8 ; CHECK-NEXT: [[D0:%.*]] = load double, ptr [[D:%.*]], align 8 ; CHECK-NEXT: [[B2:%.*]] = load double, ptr [[IDXB2]], align 8 ; CHECK-NEXT: [[A2:%.*]] = load double, ptr [[IDXA2]], align 8 -; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 ; CHECK-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A]], align 8 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[B0]], i32 0 +; CHECK-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B2]], i32 1 ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP0]], [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A2]], i32 1 -; CHECK-NEXT: 
[[TMP6:%.*]] = insertelement <2 x double> poison, double [[D0]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[B1]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP8]] -; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[S:%.*]], align 8 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP1]], double [[D0]], i32 0 +; CHECK-NEXT: [[TMP7:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP7]] +; CHECK-NEXT: store <2 x double> [[TMP8]], ptr [[S:%.*]], align 8 ; CHECK-NEXT: store double [[A1]], ptr [[EXT1:%.*]], align 8 ; CHECK-NEXT: store double [[A1]], ptr [[EXT2:%.*]], align 8 ; CHECK-NEXT: store double [[A1]], ptr [[EXT3:%.*]], align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll index 5b33c6e889363..19cbce0767c92 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll @@ -180,12 +180,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x ptr> [[TMP4]], <4 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP5]], i32 4, <4 x i1> , <4 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], -; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], 
i64 4 +; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], +; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -533,161 +541,149 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_div( ; SSE-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 -; SSE-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -; SSE-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 -; SSE-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr 
[[TMP1]], i64 12 -; SSE-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 -; SSE-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 -; SSE-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 -; SSE-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 16 -; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 -; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP7]], i64 1 -; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP11]], i64 2 -; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP15]], i64 3 -; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 -; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP9]], i64 1 -; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP13]], i64 2 -; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP17]], i64 3 -; SSE-NEXT: [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]] -; SSE-NEXT: store <4 x float> [[TMP27]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 +; SSE-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP9:%.*]] = 
getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 16 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 +; SSE-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> +; SSE-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> +; SSE-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32> +; SSE-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP12]], <4 x i32> +; SSE-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP14]], <4 x i32> +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i64 3 +; SSE-NEXT: [[TMP23:%.*]] = fdiv <4 x float> [[TMP19]], [[TMP22]] +; SSE-NEXT: store <4 x float> [[TMP23]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; SSE-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; SSE-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 ; SSE-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; SSE-NEXT: [[TMP30:%.*]] = 
getelementptr inbounds i8, ptr [[TMP1]], i64 120 ; SSE-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 ; SSE-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 +; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 ; SSE-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 ; SSE-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 +; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 ; SSE-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 -; SSE-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP40]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 -; SSE-NEXT: [[TMP43:%.*]] = load float, ptr [[TMP42]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP29]], i64 0 -; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP33]], i64 1 -; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP37]], i64 2 -; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 3 -; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> poison, float [[TMP31]], i64 0 -; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP35]], i64 1 -; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP39]], i64 
2 -; SSE-NEXT: [[TMP51:%.*]] = insertelement <4 x float> [[TMP50]], float [[TMP43]], i64 3 -; SSE-NEXT: [[TMP52:%.*]] = fdiv <4 x float> [[TMP47]], [[TMP51]] -; SSE-NEXT: store <4 x float> [[TMP52]], ptr [[TMP18]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i64 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i64 1 +; SSE-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i64 2 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i64 3 +; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i64 0 +; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i64 1 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i64 2 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i64 3 +; SSE-NEXT: [[TMP48:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP47]] +; SSE-NEXT: store <4 x float> [[TMP48]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( ; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 -; AVX-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -; AVX-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 -; AVX-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 -; AVX-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 -; AVX-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, 
!tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 -; AVX-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 -; AVX-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 -; AVX-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 -; AVX-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 -; AVX-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 -; AVX-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 -; AVX-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 -; AVX-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 -; AVX-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 -; AVX-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x 
float> [[TMP37]], float [[TMP19]], i64 4 -; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 -; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 -; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] -; AVX-NEXT: store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 +; AVX-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; AVX-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; AVX-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; AVX-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], 
align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 +; AVX-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 +; AVX-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 +; AVX-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 +; AVX-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> +; AVX-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> +; AVX-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> +; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4 +; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5 +; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float 
[[TMP18]], i64 6 +; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7 +; AVX-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> +; AVX-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> +; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3 +; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4 +; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5 +; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6 +; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7 +; AVX-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] +; AVX-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( ; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 -; AVX2-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -; AVX2-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 -; AVX2-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 -; AVX2-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 -; AVX2-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; 
AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 -; AVX2-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 -; AVX2-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 -; AVX2-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 -; AVX2-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 -; AVX2-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 -; AVX2-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 -; AVX2-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 -; AVX2-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 -; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX2-NEXT: 
[[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 -; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 -; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] -; AVX2-NEXT: store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 +; AVX2-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; AVX2-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; AVX2-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; AVX2-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 +; AVX2-NEXT: 
[[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 +; AVX2-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 +; AVX2-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 +; AVX2-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> +; AVX2-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> +; AVX2-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> +; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4 +; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5 +; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float 
[[TMP22]], i64 7 +; AVX2-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> +; AVX2-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> +; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3 +; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4 +; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5 +; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6 +; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7 +; AVX2-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] +; AVX2-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index 09d6c77557efa..9ac4208c63285 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -180,12 +180,20 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load_2( -; AVX512VL-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP1:%.*]], i64 0 -; AVX512VL-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr> [[TMP3]], <4 x ptr> poison, <4 x i32> zeroinitializer -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr i32, <4 x ptr> [[TMP4]], <4 x i64> -; AVX512VL-NEXT: [[TMP6:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP5]], i32 4, <4 x i1> , <4 x i32> poison), !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], -; AVX512VL-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4 +; AVX512VL-NEXT: 
[[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX512VL-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX512VL-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 +; AVX512VL-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 +; AVX512VL-NEXT: [[TMP13:%.*]] = insertelement <4 x i32> [[TMP12]], i32 [[TMP8]], i64 2 +; AVX512VL-NEXT: [[TMP14:%.*]] = insertelement <4 x i32> [[TMP13]], i32 [[TMP10]], i64 3 +; AVX512VL-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], +; AVX512VL-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: ret void ; %3 = getelementptr inbounds i32, ptr %1, i64 1 @@ -533,161 +541,149 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_div( ; SSE-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 -; SSE-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -; SSE-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 -; SSE-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 -; SSE-NEXT: 
[[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 -; SSE-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 -; SSE-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 -; SSE-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 16 -; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 -; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP7]], i64 1 -; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP11]], i64 2 -; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP15]], i64 3 -; SSE-NEXT: [[TMP23:%.*]] = insertelement <4 x float> poison, float [[TMP5]], i64 0 -; SSE-NEXT: [[TMP24:%.*]] = insertelement <4 x float> [[TMP23]], float [[TMP9]], i64 1 -; SSE-NEXT: [[TMP25:%.*]] = insertelement <4 x float> [[TMP24]], float [[TMP13]], i64 2 -; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP17]], i64 3 -; SSE-NEXT: [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]] -; SSE-NEXT: store <4 x float> [[TMP27]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 +; SSE-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr 
[[TMP0:%.*]], i64 16 +; SSE-NEXT: [[TMP10:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP11:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP12:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP13:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 +; SSE-NEXT: [[TMP14:%.*]] = shufflevector <2 x float> [[TMP11]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP15:%.*]] = shufflevector <4 x float> [[TMP13]], <4 x float> [[TMP14]], <4 x i32> +; SSE-NEXT: [[TMP16:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP17:%.*]] = shufflevector <4 x float> [[TMP15]], <4 x float> [[TMP16]], <4 x i32> +; SSE-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP12]], <2 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP19:%.*]] = shufflevector <4 x float> [[TMP17]], <4 x float> [[TMP18]], <4 x i32> +; SSE-NEXT: [[TMP20:%.*]] = shufflevector <2 x float> [[TMP10]], <2 x float> [[TMP12]], <4 x i32> +; SSE-NEXT: [[TMP21:%.*]] = shufflevector <4 x float> [[TMP20]], <4 x float> [[TMP14]], <4 x i32> +; SSE-NEXT: [[TMP22:%.*]] = insertelement <4 x float> [[TMP21]], float [[TMP8]], i64 3 +; SSE-NEXT: [[TMP23:%.*]] = fdiv <4 x float> [[TMP19]], [[TMP22]] +; SSE-NEXT: store <4 x float> [[TMP23]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; SSE-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; SSE-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 ; SSE-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr 
[[TMP1]], i64 120 ; SSE-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 ; SSE-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 +; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 ; SSE-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 ; SSE-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 +; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 ; SSE-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 -; SSE-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP40]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 -; SSE-NEXT: [[TMP43:%.*]] = load float, ptr [[TMP42]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP29]], i64 0 -; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP33]], i64 1 -; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP37]], i64 2 -; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP41]], i64 3 -; SSE-NEXT: [[TMP48:%.*]] = insertelement <4 x float> poison, float [[TMP31]], i64 0 -; SSE-NEXT: [[TMP49:%.*]] = insertelement <4 x float> [[TMP48]], float [[TMP35]], i64 1 -; SSE-NEXT: [[TMP50:%.*]] = insertelement <4 x float> [[TMP49]], float [[TMP39]], i64 2 -; SSE-NEXT: [[TMP51:%.*]] = 
insertelement <4 x float> [[TMP50]], float [[TMP43]], i64 3 -; SSE-NEXT: [[TMP52:%.*]] = fdiv <4 x float> [[TMP47]], [[TMP51]] -; SSE-NEXT: store <4 x float> [[TMP52]], ptr [[TMP18]], align 4, !tbaa [[TBAA0]] +; SSE-NEXT: [[TMP40:%.*]] = insertelement <4 x float> poison, float [[TMP25]], i64 0 +; SSE-NEXT: [[TMP41:%.*]] = insertelement <4 x float> [[TMP40]], float [[TMP29]], i64 1 +; SSE-NEXT: [[TMP42:%.*]] = insertelement <4 x float> [[TMP41]], float [[TMP33]], i64 2 +; SSE-NEXT: [[TMP43:%.*]] = insertelement <4 x float> [[TMP42]], float [[TMP37]], i64 3 +; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP27]], i64 0 +; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP31]], i64 1 +; SSE-NEXT: [[TMP46:%.*]] = insertelement <4 x float> [[TMP45]], float [[TMP35]], i64 2 +; SSE-NEXT: [[TMP47:%.*]] = insertelement <4 x float> [[TMP46]], float [[TMP39]], i64 3 +; SSE-NEXT: [[TMP48:%.*]] = fdiv <4 x float> [[TMP43]], [[TMP47]] +; SSE-NEXT: store <4 x float> [[TMP48]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_div( ; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 -; AVX-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -; AVX-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 -; AVX-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 -; AVX-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 -; AVX-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: 
[[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 -; AVX-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 -; AVX-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 -; AVX-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 -; AVX-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 -; AVX-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 -; AVX-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 -; AVX-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 -; AVX-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 -; AVX-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 -; AVX-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 -; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float 
[[TMP19]], i64 4 -; AVX-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX-NEXT: [[TMP40:%.*]] = insertelement <8 x float> [[TMP39]], float [[TMP27]], i64 6 -; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 -; AVX-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 -; AVX-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] -; AVX-NEXT: store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 +; AVX-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; AVX-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; AVX-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; AVX-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] 
+; AVX-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 +; AVX-NEXT: [[TMP16:%.*]] = load float, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 +; AVX-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 +; AVX-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 +; AVX-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> +; AVX-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> +; AVX-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> +; AVX-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> +; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4 +; AVX-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5 +; AVX-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6 +; AVX-NEXT: 
[[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7 +; AVX-NEXT: [[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> +; AVX-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> +; AVX-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3 +; AVX-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4 +; AVX-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5 +; AVX-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6 +; AVX-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7 +; AVX-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] +; AVX-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_div( ; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 -; AVX2-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 -; AVX2-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 -; AVX2-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 -; AVX2-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 -; AVX2-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 -; AVX2-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = 
getelementptr inbounds i8, ptr [[TMP1]], i64 176 -; AVX2-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 -; AVX2-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 -; AVX2-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 -; AVX2-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 -; AVX2-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 -; AVX2-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 -; AVX2-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 -; AVX2-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 -; AVX2-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 -; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 -; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP11]], i64 2 -; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP15]], i64 3 -; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP19]], i64 4 -; AVX2-NEXT: [[TMP39:%.*]] = insertelement <8 x float> [[TMP38]], float [[TMP23]], i64 5 -; AVX2-NEXT: [[TMP40:%.*]] = insertelement <8 x 
float> [[TMP39]], float [[TMP27]], i64 6 -; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP31]], i64 7 -; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> poison, float [[TMP5]], i64 0 -; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP9]], i64 1 -; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP13]], i64 2 -; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP17]], i64 3 -; AVX2-NEXT: [[TMP46:%.*]] = insertelement <8 x float> [[TMP45]], float [[TMP21]], i64 4 -; AVX2-NEXT: [[TMP47:%.*]] = insertelement <8 x float> [[TMP46]], float [[TMP25]], i64 5 -; AVX2-NEXT: [[TMP48:%.*]] = insertelement <8 x float> [[TMP47]], float [[TMP29]], i64 6 -; AVX2-NEXT: [[TMP49:%.*]] = insertelement <8 x float> [[TMP48]], float [[TMP33]], i64 7 -; AVX2-NEXT: [[TMP50:%.*]] = fdiv <8 x float> [[TMP41]], [[TMP49]] -; AVX2-NEXT: store <8 x float> [[TMP50]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 +; AVX2-NEXT: [[TMP8:%.*]] = load float, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 +; AVX2-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 +; AVX2-NEXT: [[TMP12:%.*]] = load float, ptr [[TMP11]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; AVX2-NEXT: [[TMP14:%.*]] = load float, ptr [[TMP13]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 +; AVX2-NEXT: [[TMP16:%.*]] = load float, ptr 
[[TMP15]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP18:%.*]] = load float, ptr [[TMP17]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 +; AVX2-NEXT: [[TMP20:%.*]] = load float, ptr [[TMP19]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 +; AVX2-NEXT: [[TMP22:%.*]] = load float, ptr [[TMP21]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 +; AVX2-NEXT: [[TMP24:%.*]] = load float, ptr [[TMP23]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP25:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP26:%.*]] = load <2 x float>, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP27:%.*]] = load <2 x float>, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] +; AVX2-NEXT: [[TMP28:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 +; AVX2-NEXT: [[TMP29:%.*]] = shufflevector <2 x float> [[TMP26]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP30:%.*]] = shufflevector <8 x float> [[TMP28]], <8 x float> [[TMP29]], <8 x i32> +; AVX2-NEXT: [[TMP31:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP32:%.*]] = shufflevector <8 x float> [[TMP30]], <8 x float> [[TMP31]], <8 x i32> +; AVX2-NEXT: [[TMP33:%.*]] = shufflevector <2 x float> [[TMP27]], <2 x float> poison, <8 x i32> +; AVX2-NEXT: [[TMP34:%.*]] = shufflevector <8 x float> [[TMP32]], <8 x float> [[TMP33]], <8 x i32> +; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP10]], i64 4 +; AVX2-NEXT: [[TMP36:%.*]] = insertelement <8 x float> [[TMP35]], float [[TMP14]], i64 5 +; AVX2-NEXT: [[TMP37:%.*]] = insertelement <8 x float> [[TMP36]], float [[TMP18]], i64 6 +; AVX2-NEXT: [[TMP38:%.*]] = insertelement <8 x float> [[TMP37]], float [[TMP22]], i64 7 +; AVX2-NEXT: 
[[TMP39:%.*]] = shufflevector <2 x float> [[TMP25]], <2 x float> [[TMP27]], <8 x i32> +; AVX2-NEXT: [[TMP40:%.*]] = shufflevector <8 x float> [[TMP39]], <8 x float> [[TMP29]], <8 x i32> +; AVX2-NEXT: [[TMP41:%.*]] = insertelement <8 x float> [[TMP40]], float [[TMP8]], i64 3 +; AVX2-NEXT: [[TMP42:%.*]] = insertelement <8 x float> [[TMP41]], float [[TMP12]], i64 4 +; AVX2-NEXT: [[TMP43:%.*]] = insertelement <8 x float> [[TMP42]], float [[TMP16]], i64 5 +; AVX2-NEXT: [[TMP44:%.*]] = insertelement <8 x float> [[TMP43]], float [[TMP20]], i64 6 +; AVX2-NEXT: [[TMP45:%.*]] = insertelement <8 x float> [[TMP44]], float [[TMP24]], i64 7 +; AVX2-NEXT: [[TMP46:%.*]] = fdiv <8 x float> [[TMP38]], [[TMP45]] +; AVX2-NEXT: store <8 x float> [[TMP46]], ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_div( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll index 249b51592760c..92a4095c7c57a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr48879-sroa.ll @@ -71,120 +71,66 @@ define { i64, i64 } @compute_min(ptr nocapture noundef nonnull readonly align 2 ; ; AVX-LABEL: @compute_min( ; AVX-NEXT: entry: -; AVX-NEXT: [[TMP0:%.*]] = load i16, ptr [[Y:%.*]], align 2 -; AVX-NEXT: [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 2 -; AVX-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP0]], i16 [[TMP1]]) -; AVX-NEXT: [[ARRAYIDX_I_I_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 1 -; AVX-NEXT: [[ARRAYIDX_I_I10_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 1 -; AVX-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_1]], align 2 -; AVX-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I_I_1]], align 2 -; AVX-NEXT: [[TMP5:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP3]], i16 [[TMP4]]) -; AVX-NEXT: [[ARRAYIDX_I_I_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], 
i64 0, i64 2 -; AVX-NEXT: [[ARRAYIDX_I_I10_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 2 -; AVX-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_2]], align 2 -; AVX-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_I_I_2]], align 2 -; AVX-NEXT: [[TMP8:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP6]], i16 [[TMP7]]) -; AVX-NEXT: [[ARRAYIDX_I_I_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 3 -; AVX-NEXT: [[ARRAYIDX_I_I10_3:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 3 -; AVX-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_3]], align 2 -; AVX-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I_I_3]], align 2 -; AVX-NEXT: [[TMP11:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP9]], i16 [[TMP10]]) -; AVX-NEXT: [[ARRAYIDX_I_I_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 4 -; AVX-NEXT: [[ARRAYIDX_I_I10_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 4 -; AVX-NEXT: [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_4]], align 2 -; AVX-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX_I_I_4]], align 2 -; AVX-NEXT: [[TMP14:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP12]], i16 [[TMP13]]) -; AVX-NEXT: [[ARRAYIDX_I_I_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 5 -; AVX-NEXT: [[ARRAYIDX_I_I10_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 5 -; AVX-NEXT: [[TMP15:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_5]], align 2 -; AVX-NEXT: [[TMP16:%.*]] = load i16, ptr [[ARRAYIDX_I_I_5]], align 2 -; AVX-NEXT: [[TMP17:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP15]], i16 [[TMP16]]) -; AVX-NEXT: [[ARRAYIDX_I_I_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 6 -; AVX-NEXT: [[ARRAYIDX_I_I10_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 6 -; AVX-NEXT: [[TMP18:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_6]], align 2 -; AVX-NEXT: [[TMP19:%.*]] = load i16, ptr [[ARRAYIDX_I_I_6]], align 2 -; AVX-NEXT: [[TMP20:%.*]] = tail call i16 
@llvm.smin.i16(i16 [[TMP18]], i16 [[TMP19]]) -; AVX-NEXT: [[ARRAYIDX_I_I_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 7 -; AVX-NEXT: [[ARRAYIDX_I_I10_7:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 7 -; AVX-NEXT: [[TMP21:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_7]], align 2 -; AVX-NEXT: [[TMP22:%.*]] = load i16, ptr [[ARRAYIDX_I_I_7]], align 2 -; AVX-NEXT: [[TMP23:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP21]], i16 [[TMP22]]) -; AVX-NEXT: [[RETVAL_SROA_4_0_INSERT_EXT:%.*]] = zext i16 [[TMP11]] to i64 -; AVX-NEXT: [[RETVAL_SROA_4_0_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_4_0_INSERT_EXT]], 48 -; AVX-NEXT: [[RETVAL_SROA_3_0_INSERT_EXT:%.*]] = zext i16 [[TMP8]] to i64 -; AVX-NEXT: [[RETVAL_SROA_3_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_3_0_INSERT_EXT]], 32 -; AVX-NEXT: [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_4_0_INSERT_SHIFT]], [[RETVAL_SROA_3_0_INSERT_SHIFT]] -; AVX-NEXT: [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i64 -; AVX-NEXT: [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_2_0_INSERT_EXT]], 16 -; AVX-NEXT: [[RETVAL_SROA_2_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], [[RETVAL_SROA_2_0_INSERT_SHIFT]] -; AVX-NEXT: [[RETVAL_SROA_0_0_INSERT_EXT:%.*]] = zext i16 [[TMP2]] to i64 -; AVX-NEXT: [[RETVAL_SROA_0_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_2_0_INSERT_INSERT]], [[RETVAL_SROA_0_0_INSERT_EXT]] -; AVX-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[RETVAL_SROA_0_0_INSERT_INSERT]], 0 -; AVX-NEXT: [[RETVAL_SROA_9_8_INSERT_EXT:%.*]] = zext i16 [[TMP23]] to i64 -; AVX-NEXT: [[RETVAL_SROA_9_8_INSERT_SHIFT:%.*]] = shl nuw i64 [[RETVAL_SROA_9_8_INSERT_EXT]], 48 -; AVX-NEXT: [[RETVAL_SROA_8_8_INSERT_EXT:%.*]] = zext i16 [[TMP20]] to i64 -; AVX-NEXT: [[RETVAL_SROA_8_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_8_8_INSERT_EXT]], 32 -; AVX-NEXT: [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or i64 
[[RETVAL_SROA_9_8_INSERT_SHIFT]], [[RETVAL_SROA_8_8_INSERT_SHIFT]] -; AVX-NEXT: [[RETVAL_SROA_7_8_INSERT_EXT:%.*]] = zext i16 [[TMP17]] to i64 -; AVX-NEXT: [[RETVAL_SROA_7_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_7_8_INSERT_EXT]], 16 -; AVX-NEXT: [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], [[RETVAL_SROA_7_8_INSERT_SHIFT]] -; AVX-NEXT: [[RETVAL_SROA_5_8_INSERT_EXT:%.*]] = zext i16 [[TMP14]] to i64 -; AVX-NEXT: [[RETVAL_SROA_5_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_7_8_INSERT_INSERT]], [[RETVAL_SROA_5_8_INSERT_EXT]] -; AVX-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[RETVAL_SROA_5_8_INSERT_INSERT]], 1 +; AVX-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[Y:%.*]], align 2 +; AVX-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[X:%.*]], align 2 +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; AVX-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP2]], <2 x i16> [[TMP3]]) +; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; AVX-NEXT: [[TMP7:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP5]], <2 x i16> [[TMP6]]) +; AVX-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; AVX-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; AVX-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP8]], <2 x i16> [[TMP9]]) +; AVX-NEXT: [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; AVX-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; AVX-NEXT: [[TMP13:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP11]], <2 x i16> [[TMP12]]) +; AVX-NEXT: [[TMP14:%.*]] = zext <2 
x i16> [[TMP13]] to <2 x i64> +; AVX-NEXT: [[TMP15:%.*]] = shl nuw <2 x i64> [[TMP14]], +; AVX-NEXT: [[TMP16:%.*]] = zext <2 x i16> [[TMP10]] to <2 x i64> +; AVX-NEXT: [[TMP17:%.*]] = shl nuw <2 x i64> [[TMP16]], +; AVX-NEXT: [[TMP18:%.*]] = or <2 x i64> [[TMP15]], [[TMP17]] +; AVX-NEXT: [[TMP19:%.*]] = zext <2 x i16> [[TMP7]] to <2 x i64> +; AVX-NEXT: [[TMP20:%.*]] = shl nuw nsw <2 x i64> [[TMP19]], +; AVX-NEXT: [[TMP21:%.*]] = or <2 x i64> [[TMP18]], [[TMP20]] +; AVX-NEXT: [[TMP22:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i64> +; AVX-NEXT: [[TMP23:%.*]] = or <2 x i64> [[TMP21]], [[TMP22]] +; AVX-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP23]], i32 0 +; AVX-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP24]], 0 +; AVX-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP23]], i32 1 +; AVX-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP25]], 1 ; AVX-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] ; ; AVX2-LABEL: @compute_min( ; AVX2-NEXT: entry: -; AVX2-NEXT: [[TMP0:%.*]] = load i16, ptr [[Y:%.*]], align 2 -; AVX2-NEXT: [[TMP1:%.*]] = load i16, ptr [[X:%.*]], align 2 -; AVX2-NEXT: [[TMP2:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP0]], i16 [[TMP1]]) -; AVX2-NEXT: [[ARRAYIDX_I_I_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 1 -; AVX2-NEXT: [[ARRAYIDX_I_I10_1:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 1 -; AVX2-NEXT: [[TMP3:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_1]], align 2 -; AVX2-NEXT: [[TMP4:%.*]] = load i16, ptr [[ARRAYIDX_I_I_1]], align 2 -; AVX2-NEXT: [[TMP5:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP3]], i16 [[TMP4]]) -; AVX2-NEXT: [[ARRAYIDX_I_I_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 2 -; AVX2-NEXT: [[ARRAYIDX_I_I10_2:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 2 -; AVX2-NEXT: [[ARRAYIDX_I_I_4:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 4 -; AVX2-NEXT: [[ARRAYIDX_I_I10_4:%.*]] = 
getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 4 -; AVX2-NEXT: [[TMP6:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_4]], align 2 -; AVX2-NEXT: [[TMP7:%.*]] = load i16, ptr [[ARRAYIDX_I_I_4]], align 2 -; AVX2-NEXT: [[TMP8:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP6]], i16 [[TMP7]]) -; AVX2-NEXT: [[ARRAYIDX_I_I_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 5 -; AVX2-NEXT: [[ARRAYIDX_I_I10_5:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 5 -; AVX2-NEXT: [[TMP9:%.*]] = load i16, ptr [[ARRAYIDX_I_I10_5]], align 2 -; AVX2-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX_I_I_5]], align 2 -; AVX2-NEXT: [[TMP11:%.*]] = tail call i16 @llvm.smin.i16(i16 [[TMP9]], i16 [[TMP10]]) -; AVX2-NEXT: [[ARRAYIDX_I_I_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[X]], i64 0, i64 6 -; AVX2-NEXT: [[ARRAYIDX_I_I10_6:%.*]] = getelementptr inbounds [8 x i16], ptr [[Y]], i64 0, i64 6 -; AVX2-NEXT: [[TMP12:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I10_2]], align 2 -; AVX2-NEXT: [[TMP13:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I_2]], align 2 -; AVX2-NEXT: [[TMP14:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP12]], <2 x i16> [[TMP13]]) -; AVX2-NEXT: [[TMP15:%.*]] = zext <2 x i16> [[TMP14]] to <2 x i64> -; AVX2-NEXT: [[TMP16:%.*]] = shl nuw <2 x i64> [[TMP15]], -; AVX2-NEXT: [[TMP17:%.*]] = extractelement <2 x i64> [[TMP16]], i32 0 -; AVX2-NEXT: [[TMP18:%.*]] = extractelement <2 x i64> [[TMP16]], i32 1 -; AVX2-NEXT: [[RETVAL_SROA_3_0_INSERT_INSERT:%.*]] = or i64 [[TMP18]], [[TMP17]] -; AVX2-NEXT: [[RETVAL_SROA_2_0_INSERT_EXT:%.*]] = zext i16 [[TMP5]] to i64 -; AVX2-NEXT: [[RETVAL_SROA_2_0_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_2_0_INSERT_EXT]], 16 -; AVX2-NEXT: [[RETVAL_SROA_2_0_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_3_0_INSERT_INSERT]], [[RETVAL_SROA_2_0_INSERT_SHIFT]] -; AVX2-NEXT: [[RETVAL_SROA_0_0_INSERT_EXT:%.*]] = zext i16 [[TMP2]] to i64 -; AVX2-NEXT: [[RETVAL_SROA_0_0_INSERT_INSERT:%.*]] = or i64 
[[RETVAL_SROA_2_0_INSERT_INSERT]], [[RETVAL_SROA_0_0_INSERT_EXT]] -; AVX2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[RETVAL_SROA_0_0_INSERT_INSERT]], 0 -; AVX2-NEXT: [[TMP19:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I10_6]], align 2 -; AVX2-NEXT: [[TMP20:%.*]] = load <2 x i16>, ptr [[ARRAYIDX_I_I_6]], align 2 -; AVX2-NEXT: [[TMP21:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP19]], <2 x i16> [[TMP20]]) -; AVX2-NEXT: [[TMP22:%.*]] = zext <2 x i16> [[TMP21]] to <2 x i64> -; AVX2-NEXT: [[TMP23:%.*]] = shl nuw <2 x i64> [[TMP22]], +; AVX2-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[Y:%.*]], align 2 +; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[X:%.*]], align 2 +; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; AVX2-NEXT: [[TMP4:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP2]], <2 x i16> [[TMP3]]) +; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; AVX2-NEXT: [[TMP7:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP5]], <2 x i16> [[TMP6]]) +; AVX2-NEXT: [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; AVX2-NEXT: [[TMP9:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; AVX2-NEXT: [[TMP10:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP8]], <2 x i16> [[TMP9]]) +; AVX2-NEXT: [[TMP11:%.*]] = shufflevector <8 x i16> [[TMP0]], <8 x i16> poison, <2 x i32> +; AVX2-NEXT: [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <2 x i32> +; AVX2-NEXT: [[TMP13:%.*]] = call <2 x i16> @llvm.smin.v2i16(<2 x i16> [[TMP11]], <2 x i16> [[TMP12]]) +; AVX2-NEXT: [[TMP14:%.*]] = zext <2 x i16> [[TMP13]] to <2 x i64> +; AVX2-NEXT: [[TMP15:%.*]] = shl nuw <2 x i64> [[TMP14]], +; AVX2-NEXT: [[TMP16:%.*]] = 
zext <2 x i16> [[TMP10]] to <2 x i64> +; AVX2-NEXT: [[TMP17:%.*]] = shl nuw <2 x i64> [[TMP16]], +; AVX2-NEXT: [[TMP18:%.*]] = or <2 x i64> [[TMP15]], [[TMP17]] +; AVX2-NEXT: [[TMP19:%.*]] = zext <2 x i16> [[TMP7]] to <2 x i64> +; AVX2-NEXT: [[TMP20:%.*]] = shl nuw nsw <2 x i64> [[TMP19]], +; AVX2-NEXT: [[TMP21:%.*]] = or <2 x i64> [[TMP18]], [[TMP20]] +; AVX2-NEXT: [[TMP22:%.*]] = zext <2 x i16> [[TMP4]] to <2 x i64> +; AVX2-NEXT: [[TMP23:%.*]] = or <2 x i64> [[TMP21]], [[TMP22]] ; AVX2-NEXT: [[TMP24:%.*]] = extractelement <2 x i64> [[TMP23]], i32 0 +; AVX2-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue { i64, i64 } poison, i64 [[TMP24]], 0 ; AVX2-NEXT: [[TMP25:%.*]] = extractelement <2 x i64> [[TMP23]], i32 1 -; AVX2-NEXT: [[RETVAL_SROA_8_8_INSERT_INSERT:%.*]] = or i64 [[TMP25]], [[TMP24]] -; AVX2-NEXT: [[RETVAL_SROA_7_8_INSERT_EXT:%.*]] = zext i16 [[TMP11]] to i64 -; AVX2-NEXT: [[RETVAL_SROA_7_8_INSERT_SHIFT:%.*]] = shl nuw nsw i64 [[RETVAL_SROA_7_8_INSERT_EXT]], 16 -; AVX2-NEXT: [[RETVAL_SROA_7_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_8_8_INSERT_INSERT]], [[RETVAL_SROA_7_8_INSERT_SHIFT]] -; AVX2-NEXT: [[RETVAL_SROA_5_8_INSERT_EXT:%.*]] = zext i16 [[TMP8]] to i64 -; AVX2-NEXT: [[RETVAL_SROA_5_8_INSERT_INSERT:%.*]] = or i64 [[RETVAL_SROA_7_8_INSERT_INSERT]], [[RETVAL_SROA_5_8_INSERT_EXT]] -; AVX2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[RETVAL_SROA_5_8_INSERT_INSERT]], 1 +; AVX2-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue { i64, i64 } [[DOTFCA_0_INSERT]], i64 [[TMP25]], 1 ; AVX2-NEXT: ret { i64, i64 } [[DOTFCA_1_INSERT]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll index 8b2b15283601a..c0835fe56f727 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction_loads.ll @@ -25,10 +25,10 @@ define i32 @test(ptr nocapture readonly %p) { ; CHECK-NEXT: br label 
[[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP1]], -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) -; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP3]], [[SUM]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = mul <8 x i32> [[TMP0]], +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP1]]) +; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP2]], [[SUM]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret i32 [[OP_RDX]] @@ -97,11 +97,11 @@ define i32 @test2(ptr nocapture readonly %p, ptr nocapture readonly %q) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr [[Q:%.*]], align 4 -; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> [[TMP1]], [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) -; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP5]], [[SUM]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = mul <8 x i32> [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP2]]) +; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP3]], [[SUM]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret i32 [[OP_RDX]] @@ -186,12 +186,12 @@ define i32 @test3(ptr nocapture readonly %p, ptr nocapture readonly %q) { ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: 
[[SUM:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[OP_RDX:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = load <8 x i32>, ptr [[Q:%.*]], align 4 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP4:%.*]] = mul <8 x i32> [[TMP1]], [[SHUFFLE]] -; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP4]]) -; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP5]], [[SUM]] +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x i32>, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr [[Q:%.*]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = mul <8 x i32> [[TMP0]], [[TMP2]] +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[TMP3]]) +; CHECK-NEXT: [[OP_RDX]] = add i32 [[TMP4]], [[SUM]] ; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret i32 [[OP_RDX]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll index 83457cc4966f7..729d5fd5546dc 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll @@ -10,19 +10,39 @@ declare void @llvm.masked.scatter.v2f64.v2p0(<2 x double>, <2 x ptr>, i32 immarg define void @test(ptr nocapture readonly %arg, ptr nocapture readonly %arg1, ptr nocapture %arg2) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x ptr> poison, ptr [[ARG:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, <8 x ptr> [[SHUFFLE]], <8 x i64> +; CHECK-NEXT: [[GEP1_0:%.*]] = getelementptr inbounds double, ptr 
[[ARG:%.*]], i64 1 +; CHECK-NEXT: [[LD1_0:%.*]] = load double, ptr [[GEP1_0]], align 8 ; CHECK-NEXT: [[GEP2_0:%.*]] = getelementptr inbounds double, ptr [[ARG1:%.*]], i64 16 -; CHECK-NEXT: [[TMP2:%.*]] = call <8 x double> @llvm.masked.gather.v8f64.v8p0(<8 x ptr> [[TMP1]], i32 8, <8 x i1> , <8 x double> poison) -; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = fmul fast <8 x double> [[TMP4]], [[TMP2]] -; CHECK-NEXT: [[TMP7:%.*]] = load <8 x double>, ptr [[ARG1]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = fmul fast <8 x double> [[TMP7]], [[TMP2]] -; CHECK-NEXT: [[TMP9:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP8]]) -; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP5]]) -; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP9]], i64 0 -; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP10]], i64 1 +; CHECK-NEXT: [[GEP1_1:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 3 +; CHECK-NEXT: [[LD1_1:%.*]] = load double, ptr [[GEP1_1]], align 8 +; CHECK-NEXT: [[GEP1_2:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 5 +; CHECK-NEXT: [[LD1_2:%.*]] = load double, ptr [[GEP1_2]], align 8 +; CHECK-NEXT: [[GEP1_3:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 7 +; CHECK-NEXT: [[LD1_3:%.*]] = load double, ptr [[GEP1_3]], align 8 +; CHECK-NEXT: [[GEP1_4:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 9 +; CHECK-NEXT: [[LD1_4:%.*]] = load double, ptr [[GEP1_4]], align 8 +; CHECK-NEXT: [[GEP1_5:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 11 +; CHECK-NEXT: [[LD1_5:%.*]] = load double, ptr [[GEP1_5]], align 8 +; CHECK-NEXT: [[GEP1_6:%.*]] = getelementptr inbounds double, ptr [[ARG]], i64 13 +; CHECK-NEXT: [[LD1_6:%.*]] = load double, ptr [[GEP1_6]], align 8 +; CHECK-NEXT: [[GEP1_7:%.*]] = getelementptr inbounds double, ptr 
[[ARG]], i64 15 +; CHECK-NEXT: [[LD1_7:%.*]] = load double, ptr [[GEP1_7]], align 8 +; CHECK-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr [[ARG1]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x double> poison, double [[LD1_0]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x double> [[TMP1]], double [[LD1_1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x double> [[TMP2]], double [[LD1_2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x double> [[TMP3]], double [[LD1_3]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x double> [[TMP4]], double [[LD1_4]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[LD1_5]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x double> [[TMP6]], double [[LD1_6]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> [[TMP7]], double [[LD1_7]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = fmul fast <8 x double> [[TMP0]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP9]]) +; CHECK-NEXT: [[TMP11:%.*]] = load <8 x double>, ptr [[GEP2_0]], align 8 +; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <8 x double> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = call fast double @llvm.vector.reduce.fadd.v8f64(double -0.000000e+00, <8 x double> [[TMP12]]) +; CHECK-NEXT: [[I142:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i64 0 +; CHECK-NEXT: [[I143:%.*]] = insertelement <2 x double> [[I142]], double [[TMP13]], i64 1 ; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds double, ptr [[ARG2:%.*]], <2 x i64> ; CHECK-NEXT: call void @llvm.masked.scatter.v2f64.v2p0(<2 x double> [[I143]], <2 x ptr> [[P]], i32 8, <2 x i1> ) ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll index 40dcc79f79ffc..09a5ace101e64 100644 --- 
a/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark-masked-loads-consecutive-loads-same-ptr.ll @@ -8,19 +8,23 @@ ; YAML-NEXT: Function: test ; YAML-NEXT: Args: ; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' -; YAML-NEXT: - Cost: '-5' +; YAML-NEXT: - Cost: '-7' ; YAML-NEXT: - String: ' and with tree size ' -; YAML-NEXT: - TreeSize: '7' +; YAML-NEXT: - TreeSize: '5' define void @test(ptr noalias %p, ptr noalias %p1) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x ptr> poison, ptr [[P:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x ptr> [[TMP0]], <4 x ptr> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <4 x ptr> [[TMP1]], <4 x i64> -; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP2]], i32 4, <4 x i1> , <4 x i32> poison) -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[P]], align 4 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP3]], [[TMP4]] +; CHECK-NEXT: [[I:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr i32, ptr [[P]], i64 32 +; CHECK-NEXT: [[I2:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr i32, ptr [[P]], i64 33 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX11]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> poison, i32 [[I]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[I2]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.vector.insert.v4i32.v2i32(<4 x i32> [[TMP3]], <2 x i32> [[TMP0]], i64 2) +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP1]] ; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[P1:%.*]], align 4 ; CHECK-NEXT: ret void ; diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll index 26c4d55436d22..7f5d803391343 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/remark_gather-load-redux-cost.ll @@ -24,7 +24,7 @@ define i32 @test(ptr noalias %p, ptr noalias %addr) { ; YAML-NEXT: - String: 'Vectorized horizontal reduction with cost ' ; YAML-NEXT: - Cost: '-1' ; YAML-NEXT: - String: ' and with tree size ' - ; YAML-NEXT: - TreeSize: '7' + ; YAML-NEXT: - TreeSize: '4' entry: %off0.1 = getelementptr inbounds i32, ptr %addr, i32 1 %idx0 = load i32, ptr %off0.1, align 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll index 787bd39759dc7..228967e63d1ef 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-phi-operand.ll @@ -9,11 +9,11 @@ define ptr @test() { ; CHECK-NEXT: [[TMP1:%.*]] = fpext <2 x float> [[TMP0]] to <2 x double> ; CHECK-NEXT: br label [[BODY:%.*]] ; CHECK: body: -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP5:%.*]], [[BODY]] ], [ [[TMP1]], [[ENTRY:%.*]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x double> [ [[TMP6:%.*]], [[BODY]] ], [ [[TMP1]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP3:%.*]] = load <2 x i16>, ptr null, align 2 ; CHECK-NEXT: [[TMP4:%.*]] = uitofp <2 x i16> [[TMP3]] to <2 x double> -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> -; CHECK-NEXT: [[TMP5]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> zeroinitializer, <2 x double> [[SHUFFLE]], <2 x double> [[TMP2]]) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <2 x i32> +; CHECK-NEXT: [[TMP6]] = call <2 x double> @llvm.fmuladd.v2f64(<2 x double> zeroinitializer, <2 x double> [[TMP5]], 
<2 x double> [[TMP2]]) ; CHECK-NEXT: br label [[BODY]] ; entry: @@ -54,8 +54,8 @@ define void @test1(ptr %agg.result, ptr %this) { ; CHECK: return: ; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ [[TMP1]], [[IF_END]] ], [ , [[LOR_LHS_FALSE]] ], [ , [[ENTRY:%.*]] ] ; CHECK-NEXT: [[C_I_I_I:%.*]] = getelementptr inbounds float, ptr [[AGG_RESULT:%.*]], i32 2 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: store <2 x float> [[SHUFFLE]], ptr [[C_I_I_I]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: store <2 x float> [[TMP3]], ptr [[C_I_I_I]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll index cfbbe14186b50..8786e1a92a326 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-possible-strided-node.ll @@ -5,13 +5,20 @@ define void @test() { ; CHECK-LABEL: define void @test( ; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> ), i32 4, <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = 
mul <4 x i32> [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[TMP7]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[TMP6]], i32 2 +; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = ashr <4 x i32> [[TMP3]], zeroinitializer -; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 ; CHECK-NEXT: ret void ; entry: @@ -57,15 +64,22 @@ define void @test1() { ; CHECK-LABEL: define void @test1( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> ), i32 4, <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> [[TMP10]], i32 [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP8]], i32 2 +; 
CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP1]], [[TMP0]] ; CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i32> [[TMP3]] to <4 x i64> ; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i64> [[TMP4]], zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = trunc <4 x i64> [[TMP5]] to <4 x i32> -; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 ; CHECK-NEXT: ret void ; entry: @@ -111,12 +125,19 @@ define void @test_div() { ; CHECK-LABEL: define void @test_div( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> ), i32 4, <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = udiv <4 x i32> [[TMP3]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP7]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i32> [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = udiv <4 x i32> 
[[TMP2]], +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 ; CHECK-NEXT: ret void ; @@ -163,12 +184,19 @@ define void @test_rem() { ; CHECK-LABEL: define void @test_rem( ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i32, ptr null, i64 33 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 +; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i32, ptr null, i64 7 +; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i32, ptr null, i64 60 -; CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> getelementptr (i32, <4 x ptr> zeroinitializer, <4 x i64> ), i32 4, <4 x i1> , <4 x i32> poison) +; CHECK-NEXT: [[TMP9:%.*]] = load <2 x i32>, ptr null, align 4 ; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[ARRAYIDX22]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = mul <4 x i32> [[TMP2]], [[TMP0]] -; CHECK-NEXT: [[TMP6:%.*]] = urem <4 x i32> [[TMP3]], +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP9]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP8]], i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP7]], i32 2 +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i32> [[TMP1]], [[TMP0]] +; CHECK-NEXT: [[TMP3:%.*]] = urem <4 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr getelementptr inbounds ([4 x i32], ptr null, i64 8, i64 0), align 16 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll 
b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll index 30f328293cdaa..c114c5dee78e9 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder-reused-masked-gather2.ll @@ -6,18 +6,20 @@ target triple = "x86_64-unknown-linux-gnu" define void @"foo"(ptr addrspace(1) %0, ptr addrspace(1) %1) #0 { ; CHECK-LABEL: @foo( -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x ptr addrspace(1)> poison, ptr addrspace(1) [[TMP0:%.*]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x ptr addrspace(1)> [[TMP3]], <4 x ptr addrspace(1)> poison, <4 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, <4 x ptr addrspace(1)> [[TMP4]], <4 x i64> -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 8 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8 -; CHECK-NEXT: [[TMP8:%.*]] = call <4 x float> @llvm.masked.gather.v4f32.v4p1(<4 x ptr addrspace(1)> [[TMP5]], i32 4, <4 x i1> , <4 x float> poison) -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> poison, <8 x i32> -; CHECK-NEXT: [[TMP10:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP7]], align 4 -; CHECK-NEXT: [[TMP11:%.*]] = fmul <8 x float> [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP12:%.*]] = fadd <8 x float> [[TMP11]], zeroinitializer -; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> poison, <8 x i32> -; CHECK-NEXT: store <8 x float> [[TMP13]], ptr addrspace(1) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0:%.*]], i64 8 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP1:%.*]], i64 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 24 +; CHECK-NEXT: [[TMP6:%.*]] = load <2 x float>, ptr addrspace(1) [[TMP3]], align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load <2 x float>, 
ptr addrspace(1) [[TMP5]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <2 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = load <8 x float>, ptr addrspace(1) [[TMP4]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> poison, <2 x float> [[TMP6]], i64 0) +; CHECK-NEXT: [[TMP11:%.*]] = call <4 x float> @llvm.vector.insert.v4f32.v2f32(<4 x float> [[TMP10]], <2 x float> [[TMP8]], i64 2) +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x float> [[TMP11]], <4 x float> poison, <8 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = fmul <8 x float> [[TMP12]], [[TMP9]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd <8 x float> [[TMP13]], zeroinitializer +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <8 x float> [[TMP14]], <8 x float> poison, <8 x i32> +; CHECK-NEXT: store <8 x float> [[TMP15]], ptr addrspace(1) [[TMP3]], align 4 ; CHECK-NEXT: ret void ; %3 = getelementptr inbounds i8, ptr addrspace(1) %0, i64 8 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll index f0e734d8c5aef..2658317e97927 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/reorder_phi.ll @@ -9,8 +9,8 @@ define void @foo (ptr %A, ptr %B, ptr %Result) { ; CHECK-NEXT: [[TMP0:%.*]] = add i64 256, 0 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP18:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP20:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[TMP20:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[TMP2:%.*]] = phi <2 x float> [ zeroinitializer, [[ENTRY]] ], [ [[TMP19:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX:%.*]], ptr [[A:%.*]], i64 [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_COMPLEX]], ptr 
[[B:%.*]], i64 [[TMP1]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4 @@ -26,13 +26,13 @@ define void @foo (ptr %A, ptr %B, ptr %Result) { ; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <2 x float> [[TMP14]], <2 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP16:%.*]] = fsub <2 x float> [[TMP11]], [[TMP15]] ; CHECK-NEXT: [[TMP17:%.*]] = fadd <2 x float> [[TMP11]], [[TMP15]] -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> [[TMP17]], <2 x i32> -; CHECK-NEXT: [[TMP20]] = fadd <2 x float> [[TMP2]], [[TMP21]] -; CHECK-NEXT: [[TMP18]] = add nuw nsw i64 [[TMP1]], 1 -; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[TMP18]], [[TMP0]] -; CHECK-NEXT: br i1 [[TMP19]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <2 x float> [[TMP16]], <2 x float> [[TMP17]], <2 x i32> +; CHECK-NEXT: [[TMP19]] = fadd <2 x float> [[TMP2]], [[TMP18]] +; CHECK-NEXT: [[TMP20]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[TMP21:%.*]] = icmp eq i64 [[TMP20]], [[TMP0]] +; CHECK-NEXT: br i1 [[TMP21]], label [[EXIT:%.*]], label [[LOOP]] ; CHECK: exit: -; CHECK-NEXT: store <2 x float> [[TMP20]], ptr [[RESULT:%.*]], align 4 +; CHECK-NEXT: store <2 x float> [[TMP19]], ptr [[RESULT:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll index 82085ade519e2..360b258f216c5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/scatter-vectorize-reorder.ll @@ -12,17 +12,17 @@ define void @test() { ; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x float> zeroinitializer, [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX10_I_I86]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr undef, align 4 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , float [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = 
shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <2 x float> [[TMP5]], float [[TMP3]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> , <2 x i32> -; CHECK-NEXT: [[TMP8:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP6]], <2 x float> [[TMP7]]) +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x float> , float [[TMP2]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> [[TMP0]], float [[TMP3]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x float> [[TMP4]], <2 x float> , <2 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[TMP4]], <2 x float> [[TMP5]], <2 x float> [[TMP6]]) ; CHECK-NEXT: br i1 false, label [[BB2:%.*]], label [[BB3:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP9:%.*]] = fmul <2 x float> [[TMP8]], zeroinitializer +; CHECK-NEXT: [[TMP8:%.*]] = fmul <2 x float> [[TMP7]], zeroinitializer ; CHECK-NEXT: br label [[BB3]] ; CHECK: bb3: -; CHECK-NEXT: [[TMP10:%.*]] = phi <2 x float> [ [[TMP9]], [[BB2]] ], [ zeroinitializer, [[BB1]] ] +; CHECK-NEXT: [[TMP9:%.*]] = phi <2 x float> [ [[TMP8]], [[BB2]] ], [ zeroinitializer, [[BB1]] ] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x float> [[TMP9]], <2 x float> poison, <2 x i32> ; CHECK-NEXT: [[TMP11:%.*]] = fadd <2 x float> [[TMP1]], [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = fadd <2 x float> [[TMP11]], zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fsub <2 x float> [[TMP12]], zeroinitializer diff --git a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll index 9810d50beea73..8497493e0069c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/sin-sqrt.ll @@ -11,82 +11,58 @@ declare double @llvm.sin.f64(double) define void @test() { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[A0:%.*]] = load double, ptr @src, align 8 -; CHECK-NEXT: 
[[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8 -; CHECK-NEXT: [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 -; CHECK-NEXT: [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8 -; CHECK-NEXT: [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 -; CHECK-NEXT: [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8 -; CHECK-NEXT: [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 -; CHECK-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP2]]) -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @src, align 8 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 +; CHECK-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP4]], <2 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP5]]) -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 
x double> [[TMP7]], double [[A4]], i32 1 -; CHECK-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]]) -; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1 +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP4]], <2 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = call fast <2 x double> @llvm.sin.v2f64(<2 x double> [[TMP7]]) +; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP3]], <2 x i32> +; CHECK-NEXT: [[TMP10:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP9]]) +; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP3]], <2 x i32> ; CHECK-NEXT: [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]]) -; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP12]] +; CHECK-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP8]] +; CHECK-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP12]] ; CHECK-NEXT: [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]] ; CHECK-NEXT: store <2 x double> [[TMP15]], ptr @dst, align 8 ; CHECK-NEXT: ret void ; ; VECLIB-LABEL: @test( -; VECLIB-NEXT: [[A0:%.*]] = load double, ptr @src, align 8 -; VECLIB-NEXT: [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8 -; VECLIB-NEXT: [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 -; VECLIB-NEXT: [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8 -; VECLIB-NEXT: [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 -; VECLIB-NEXT: [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, 
i64 5), align 8 -; VECLIB-NEXT: [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 -; VECLIB-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8 -; VECLIB-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0 -; VECLIB-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1 -; VECLIB-NEXT: [[TMP3:%.*]] = call fast <2 x double> @__svml_sin2(<2 x double> [[TMP2]]) -; VECLIB-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0 -; VECLIB-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1 +; VECLIB-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @src, align 8 +; VECLIB-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 +; VECLIB-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 +; VECLIB-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 +; VECLIB-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP4]], <2 x i32> ; VECLIB-NEXT: [[TMP6:%.*]] = call fast <2 x double> @__svml_sin2(<2 x double> [[TMP5]]) -; VECLIB-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; VECLIB-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A4]], i32 1 -; VECLIB-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]]) -; VECLIB-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0 -; VECLIB-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1 +; VECLIB-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP4]], <2 x i32> +; VECLIB-NEXT: [[TMP8:%.*]] = call fast <2 x double> @__svml_sin2(<2 x double> [[TMP7]]) +; VECLIB-NEXT: [[TMP9:%.*]] = 
shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP3]], <2 x i32> +; VECLIB-NEXT: [[TMP10:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP9]]) +; VECLIB-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP3]], <2 x i32> ; VECLIB-NEXT: [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]]) -; VECLIB-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]] -; VECLIB-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP12]] +; VECLIB-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP8]] +; VECLIB-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP12]] ; VECLIB-NEXT: [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]] ; VECLIB-NEXT: store <2 x double> [[TMP15]], ptr @dst, align 8 ; VECLIB-NEXT: ret void ; ; AMDLIBM-LABEL: @test( -; AMDLIBM-NEXT: [[A0:%.*]] = load double, ptr @src, align 8 -; AMDLIBM-NEXT: [[A1:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), align 8 -; AMDLIBM-NEXT: [[A2:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 -; AMDLIBM-NEXT: [[A3:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 3), align 8 -; AMDLIBM-NEXT: [[A4:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 -; AMDLIBM-NEXT: [[A5:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 5), align 8 -; AMDLIBM-NEXT: [[A6:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 -; AMDLIBM-NEXT: [[A7:%.*]] = load double, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 7), align 8 -; AMDLIBM-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A2]], i32 0 -; AMDLIBM-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[A6]], i32 1 -; AMDLIBM-NEXT: [[TMP3:%.*]] = call fast <2 x double> @amd_vrd2_sin(<2 x double> 
[[TMP2]]) -; AMDLIBM-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[A3]], i32 0 -; AMDLIBM-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A7]], i32 1 +; AMDLIBM-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr @src, align 8 +; AMDLIBM-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 2), align 8 +; AMDLIBM-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 4), align 8 +; AMDLIBM-NEXT: [[TMP4:%.*]] = load <2 x double>, ptr getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 6), align 8 +; AMDLIBM-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP4]], <2 x i32> ; AMDLIBM-NEXT: [[TMP6:%.*]] = call fast <2 x double> @amd_vrd2_sin(<2 x double> [[TMP5]]) -; AMDLIBM-NEXT: [[TMP7:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; AMDLIBM-NEXT: [[TMP8:%.*]] = insertelement <2 x double> [[TMP7]], double [[A4]], i32 1 -; AMDLIBM-NEXT: [[TMP9:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP8]]) -; AMDLIBM-NEXT: [[TMP10:%.*]] = insertelement <2 x double> poison, double [[A1]], i32 0 -; AMDLIBM-NEXT: [[TMP11:%.*]] = insertelement <2 x double> [[TMP10]], double [[A5]], i32 1 +; AMDLIBM-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP4]], <2 x i32> +; AMDLIBM-NEXT: [[TMP8:%.*]] = call fast <2 x double> @amd_vrd2_sin(<2 x double> [[TMP7]]) +; AMDLIBM-NEXT: [[TMP9:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP3]], <2 x i32> +; AMDLIBM-NEXT: [[TMP10:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP9]]) +; AMDLIBM-NEXT: [[TMP11:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> [[TMP3]], <2 x i32> ; AMDLIBM-NEXT: [[TMP12:%.*]] = call fast <2 x double> @llvm.sqrt.v2f64(<2 x double> [[TMP11]]) -; AMDLIBM-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP9]], [[TMP6]] -; AMDLIBM-NEXT: [[TMP14:%.*]] = fadd fast <2 x 
double> [[TMP3]], [[TMP12]] +; AMDLIBM-NEXT: [[TMP13:%.*]] = fadd fast <2 x double> [[TMP10]], [[TMP8]] +; AMDLIBM-NEXT: [[TMP14:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP12]] ; AMDLIBM-NEXT: [[TMP15:%.*]] = fadd fast <2 x double> [[TMP13]], [[TMP14]] ; AMDLIBM-NEXT: store <2 x double> [[TMP15]], ptr @dst, align 8 ; AMDLIBM-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll index 6ca1f8119c1cf..202ec9633712f 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2-unord.ll @@ -7,28 +7,18 @@ define dso_local void @_Z4testP1S(ptr %p) local_unnamed_addr { ; CHECK-LABEL: @_Z4testP1S( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[P:%.*]], i64 0, i32 1, i64 0 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 15 -; CHECK-NEXT: [[I1:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 6 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[ARRAYIDX13]], align 4 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 4 -; CHECK-NEXT: [[I7:%.*]] = load i32, ptr [[ARRAYIDX20]], align 4 ; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 12 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 14 -; CHECK-NEXT: [[I13:%.*]] = load i32, ptr [[ARRAYIDX41]], align 4 -; CHECK-NEXT: [[ARRAYIDX48:%.*]] = getelementptr inbounds [[STRUCT_S]], ptr [[P]], i64 0, i32 2, i64 5 -; CHECK-NEXT: [[I15:%.*]] = load i32, ptr [[ARRAYIDX48]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX27]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr 
[[ARRAYIDX27]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = load <8 x i32>, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> poison, i32 [[I1]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP0]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[I7]], i32 3 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[I13]], i32 6 -; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[I15]], i32 7 -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP8]], <2 x i32> [[TMP1]], i64 4) -; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP2]] -; CHECK-NEXT: store <8 x i32> [[TMP10]], ptr [[P]], align 4 +; CHECK-NEXT: [[TMP3:%.*]] = load <4 x i32>, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP3]], <8 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.vector.insert.v8i32.v2i32(<8 x i32> [[TMP5]], <2 x i32> [[TMP0]], i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = add nsw <8 x i32> [[TMP6]], [[TMP2]] +; CHECK-NEXT: store <8 x i32> [[TMP7]], ptr [[P]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll index ba83ff096c9ac..9778218df6816 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/split-load8_2_unord_geps.ll @@ -4,20 +4,16 @@ define void @test(ptr noalias %p, ptr noalias %addr, ptr noalias %s) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <8 x 
ptr> poison, ptr [[ADDR:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE1:%.*]] = shufflevector <8 x ptr> [[TMP0]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE1]], <8 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE1]], <8 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP2]], i32 8, <8 x i1> , <8 x i32> poison) -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <8 x ptr> poison, ptr [[P:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE2:%.*]] = shufflevector <8 x ptr> [[TMP4]], <8 x ptr> poison, <8 x i32> zeroinitializer -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP5]], i32 4, <8 x i1> , <8 x i32> poison) -; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP1]], i32 8, <8 x i1> , <8 x i32> poison) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, <8 x ptr> [[SHUFFLE2]], <8 x i32> [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = call <8 x i32> @llvm.masked.gather.v8i32.v8p0(<8 x ptr> [[TMP8]], i32 4, <8 x i1> , <8 x i32> poison) -; CHECK-NEXT: [[TMP10:%.*]] = add nsw <8 x i32> [[TMP9]], [[TMP6]] -; CHECK-NEXT: store <8 x i32> [[TMP10]], ptr [[S:%.*]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x i32>, ptr [[ADDR:%.*]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[TMP0]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x ptr> poison, ptr [[P:%.*]], i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x ptr> [[TMP2]], <16 x ptr> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, <16 x ptr> [[TMP3]], <16 x i32> [[TMP1]] +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i32> @llvm.masked.gather.v16i32.v16p0(<16 x ptr> [[TMP4]], i32 4, <16 x i1> , <16 x i32> poison) +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <16 x 
i32> [[TMP5]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <16 x i32> [[TMP5]], <16 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = add nsw <8 x i32> [[TMP6]], [[TMP7]] +; CHECK-NEXT: store <8 x i32> [[TMP8]], ptr [[S:%.*]], align 4 ; CHECK-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll index 87063fc3f7a82..69ae26b9f2585 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/supernode.ll @@ -7,19 +7,13 @@ define void @test_supernode_add(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) { ; ENABLED-LABEL: @test_supernode_add( ; ENABLED-NEXT: entry: -; ENABLED-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1 -; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1 -; ENABLED-NEXT: [[A0:%.*]] = load double, ptr [[AARRAY]], align 8 -; ENABLED-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 -; ENABLED-NEXT: [[C0:%.*]] = load double, ptr [[CARRAY]], align 8 -; ENABLED-NEXT: [[C1:%.*]] = load double, ptr [[IDXC1]], align 8 -; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1 -; ENABLED-NEXT: [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP2]] -; ENABLED-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 -; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A1]], i32 1 -; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] +; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr 
[[CARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> +; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP1]], [[TMP3]] +; ENABLED-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> +; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP5]] ; ENABLED-NEXT: store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8 ; ENABLED-NEXT: ret void ; @@ -54,19 +48,13 @@ entry: define void @test_supernode_addsub(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) { ; ENABLED-LABEL: @test_supernode_addsub( ; ENABLED-NEXT: entry: -; ENABLED-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1 -; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1 -; ENABLED-NEXT: [[A0:%.*]] = load double, ptr [[AARRAY]], align 8 -; ENABLED-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 -; ENABLED-NEXT: [[C0:%.*]] = load double, ptr [[CARRAY]], align 8 -; ENABLED-NEXT: [[C1:%.*]] = load double, ptr [[IDXC1]], align 8 -; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1 -; ENABLED-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP0]] -; ENABLED-NEXT: [[TMP4:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 -; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[A1]], i32 1 -; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] +; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x 
double> [[TMP2]], <2 x i32> +; ENABLED-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]] +; ENABLED-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> +; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP4]], [[TMP5]] ; ENABLED-NEXT: store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8 ; ENABLED-NEXT: ret void ; @@ -102,22 +90,16 @@ entry: define void @test_supernode_addsub_alt(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Sarray) { ; ENABLED-LABEL: @test_supernode_addsub_alt( ; ENABLED-NEXT: entry: -; ENABLED-NEXT: [[IDXA1:%.*]] = getelementptr inbounds double, ptr [[AARRAY:%.*]], i64 1 -; ENABLED-NEXT: [[IDXC1:%.*]] = getelementptr inbounds double, ptr [[CARRAY:%.*]], i64 1 -; ENABLED-NEXT: [[A0:%.*]] = load double, ptr [[AARRAY]], align 8 -; ENABLED-NEXT: [[A1:%.*]] = load double, ptr [[IDXA1]], align 8 -; ENABLED-NEXT: [[C0:%.*]] = load double, ptr [[CARRAY]], align 8 -; ENABLED-NEXT: [[C1:%.*]] = load double, ptr [[IDXC1]], align 8 -; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[A0]], i32 0 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C1]], i32 1 -; ENABLED-NEXT: [[TMP3:%.*]] = fsub fast <2 x double> [[TMP2]], [[TMP0]] -; ENABLED-NEXT: [[TMP4:%.*]] = fadd fast <2 x double> [[TMP2]], [[TMP0]] -; ENABLED-NEXT: [[TMP5:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[TMP4]], <2 x i32> -; ENABLED-NEXT: [[TMP6:%.*]] = insertelement <2 x double> poison, double [[C0]], i32 0 -; ENABLED-NEXT: [[TMP7:%.*]] = insertelement <2 x double> [[TMP6]], double [[A1]], i32 1 -; ENABLED-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP5]], [[TMP7]] -; ENABLED-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP5]], [[TMP7]] +; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], 
align 8 +; ENABLED-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr [[CARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[TMP0]], <2 x double> [[TMP2]], <2 x i32> +; ENABLED-NEXT: [[TMP4:%.*]] = fsub fast <2 x double> [[TMP3]], [[TMP1]] +; ENABLED-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP1]] +; ENABLED-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> [[TMP5]], <2 x i32> +; ENABLED-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP2]], <2 x double> [[TMP0]], <2 x i32> +; ENABLED-NEXT: [[TMP8:%.*]] = fsub fast <2 x double> [[TMP6]], [[TMP7]] +; ENABLED-NEXT: [[TMP9:%.*]] = fadd fast <2 x double> [[TMP6]], [[TMP7]] ; ENABLED-NEXT: [[TMP10:%.*]] = shufflevector <2 x double> [[TMP8]], <2 x double> [[TMP9]], <2 x i32> ; ENABLED-NEXT: store <2 x double> [[TMP10]], ptr [[SARRAY:%.*]], align 8 ; ENABLED-NEXT: ret void @@ -177,19 +159,15 @@ entry: define void @supernode_scheduling(ptr %Aarray, ptr %Barray, ptr %Carray, ptr %Darray, ptr %Sarray) { ; ENABLED-LABEL: @supernode_scheduling( ; ENABLED-NEXT: entry: -; ENABLED-NEXT: [[IDXB1:%.*]] = getelementptr inbounds double, ptr [[BARRAY:%.*]], i64 1 ; ENABLED-NEXT: [[C:%.*]] = load double, ptr [[CARRAY:%.*]], align 8 -; ENABLED-NEXT: [[B0:%.*]] = load double, ptr [[BARRAY]], align 8 -; ENABLED-NEXT: [[B1:%.*]] = load double, ptr [[IDXB1]], align 8 ; ENABLED-NEXT: [[D:%.*]] = load double, ptr [[DARRAY:%.*]], align 8 ; ENABLED-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[AARRAY:%.*]], align 8 -; ENABLED-NEXT: [[TMP1:%.*]] = insertelement <2 x double> poison, double [[C]], i32 0 -; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[B1]], i32 1 +; ENABLED-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[BARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP2:%.*]] = insertelement <2 x double> [[TMP1]], double [[C]], i32 0 ; ENABLED-NEXT: [[TMP3:%.*]] = fadd fast <2 x double> [[TMP0]], [[TMP2]] -; ENABLED-NEXT: [[TMP4:%.*]] = insertelement <2 x double> 
poison, double [[B0]], i32 0 -; ENABLED-NEXT: [[TMP5:%.*]] = insertelement <2 x double> [[TMP4]], double [[D]], i32 1 -; ENABLED-NEXT: [[TMP6:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP5]] -; ENABLED-NEXT: store <2 x double> [[TMP6]], ptr [[SARRAY:%.*]], align 8 +; ENABLED-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP1]], double [[D]], i32 1 +; ENABLED-NEXT: [[TMP5:%.*]] = fadd fast <2 x double> [[TMP3]], [[TMP4]] +; ENABLED-NEXT: store <2 x double> [[TMP5]], ptr [[SARRAY:%.*]], align 8 ; ENABLED-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll index 243087c6d8d95..fd3c1a57aff34 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec3-calls.ll @@ -5,14 +5,12 @@ define void @vec3_vectorize_call(ptr %Colour, float %0) { ; NON-POW2-LABEL: @vec3_vectorize_call( ; NON-POW2-NEXT: entry: -; NON-POW2-NEXT: [[TMP1:%.*]] = load float, ptr [[COLOUR:%.*]], align 4 -; NON-POW2-NEXT: [[ARRAYIDX91_I:%.*]] = getelementptr float, ptr [[COLOUR]], i64 1 -; NON-POW2-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX91_I]], align 4 -; NON-POW2-NEXT: [[TMP3:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2 -; NON-POW2-NEXT: [[TMP4:%.*]] = insertelement <3 x float> [[TMP3]], float [[TMP1]], i32 0 -; NON-POW2-NEXT: [[TMP5:%.*]] = insertelement <3 x float> [[TMP4]], float [[TMP2]], i32 1 -; NON-POW2-NEXT: [[TMP6:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP5]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) -; NON-POW2-NEXT: store <3 x float> [[TMP6]], ptr [[COLOUR]], align 4 +; NON-POW2-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[COLOUR:%.*]], align 4 +; NON-POW2-NEXT: [[TMP2:%.*]] = insertelement <3 x float> poison, float [[TMP0:%.*]], i32 2 +; NON-POW2-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> +; NON-POW2-NEXT: [[TMP4:%.*]] = 
shufflevector <3 x float> [[TMP2]], <3 x float> [[TMP3]], <3 x i32> +; NON-POW2-NEXT: [[TMP5:%.*]] = call <3 x float> @llvm.fmuladd.v3f32(<3 x float> [[TMP4]], <3 x float> zeroinitializer, <3 x float> zeroinitializer) +; NON-POW2-NEXT: store <3 x float> [[TMP5]], ptr [[COLOUR]], align 4 ; NON-POW2-NEXT: ret void ; ; POW2-ONLY-LABEL: @vec3_vectorize_call( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll index 7bcb2ece77921..f16d19fab2c9d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll @@ -13,11 +13,9 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7 ; CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[T4]], align 4 ; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1 -; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 ; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6 ; CHECK-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4 ; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2 -; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 ; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5 ; CHECK-NEXT: [[T17:%.*]] = load i32, ptr [[T16]], align 4 ; CHECK-NEXT: [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3 @@ -27,10 +25,6 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]] ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]] ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]] -; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] -; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] -; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 -; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 ; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], 
[[T11]] ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] ; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T37]], [[T38]] @@ -39,18 +33,26 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T47]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[T9]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T40]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[T34]], i32 6 +; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4 +; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 +; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] +; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] +; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 +; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T27]], i32 2 +; 
CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T47]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T40]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[T34]], i32 6 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T71]], ; CHECK-NEXT: store <8 x i32> [[T76]], ptr [[T2]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll index 82f8aa5f9be1b..001ab613a6d57 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias.ll @@ -13,11 +13,9 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7 ; CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[T4]], align 4 ; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1 -; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 ; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6 ; CHECK-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4 ; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2 -; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 ; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5 ; CHECK-NEXT: [[T17:%.*]] = load i32, ptr [[T16]], align 4 ; CHECK-NEXT: 
[[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3 @@ -27,10 +25,6 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]] ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]] ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]] -; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] -; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] -; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 -; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 ; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]] ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] ; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T37]], [[T38]] @@ -39,18 +33,26 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[T15]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T27]], i32 2 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T47]], i32 3 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> , i32 [[T9]], i32 0 -; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[T40]], i32 3 -; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = mul nsw <4 x i32> [[TMP4]], [[TMP7]] -; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <4 x i32> [[TMP8]], <4 x i32> [[TMP9]], <4 x i32> -; CHECK-NEXT: [[TMP11:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> poison, <8 x i32> -; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[TMP11]], i32 [[T34]], i32 6 +; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr 
[[T8]], align 4 +; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 +; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] +; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] +; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 +; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T27]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T47]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T40]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> poison, <8 x i32> +; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[TMP13]], i32 [[T34]], i32 6 ; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T71]], ; CHECK-NEXT: store <8 x i32> [[T76]], ptr [[T2]], align 4 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll index 69ecf1852aedd..e52b29a7f681c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias_external_insert_shuffled.ll @@ -7,11 +7,9 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: 
[[T4:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 7 ; CHECK-NEXT: [[T5:%.*]] = load i32, ptr [[T4]], align 4 ; CHECK-NEXT: [[T8:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 1 -; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 ; CHECK-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 6 ; CHECK-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4 ; CHECK-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 2 -; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 ; CHECK-NEXT: [[T16:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 5 ; CHECK-NEXT: [[T17:%.*]] = load i32, ptr [[T16]], align 4 ; CHECK-NEXT: [[T20:%.*]] = getelementptr inbounds i32, ptr [[T2]], i64 3 @@ -21,11 +19,7 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]] ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]] ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]] -; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] -; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] -; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 ; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270 -; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 ; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]] ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] ; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T37]], [[T38]] @@ -35,11 +29,17 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 ; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[T15]], i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> poison, i32 [[T9]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP3]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x 
i32> [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4 +; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 +; CHECK-NEXT: [[T29:%.*]] = sub nsw i32 [[T9]], [[T15]] +; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] +; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 +; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP3]], [[TMP4]] ; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> ; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2 ; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3