From 78074c68393d253155a0a486cc8a79d8f530b85f Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Fri, 11 Oct 2024 19:55:13 +0000 Subject: [PATCH] =?UTF-8?q?[=F0=9D=98=80=F0=9D=97=BD=F0=9D=97=BF]=20initia?= =?UTF-8?q?l=20version?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Created using spr 1.3.5 --- .../llvm/Analysis/TargetTransformInfo.h | 15 + .../llvm/Analysis/TargetTransformInfoImpl.h | 5 + llvm/lib/Analysis/TargetTransformInfo.cpp | 7 + .../Target/RISCV/RISCVTargetTransformInfo.h | 6 + .../Transforms/Vectorize/SLPVectorizer.cpp | 119 ++++- .../SLPVectorizer/RISCV/complex-loads.ll | 473 ++++++++---------- .../SLPVectorizer/RISCV/segmented-loads.ll | 5 +- 7 files changed, 361 insertions(+), 269 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 64dc9aacd5c57..0459941fe05cd 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -803,6 +803,12 @@ class TargetTransformInfo { /// Return true if the target supports strided load. bool isLegalStridedLoadStore(Type *DataType, Align Alignment) const; + /// Return true if the target supports interleaved access for the given vector + /// type \p VTy, interleave factor \p Factor, alignment \p Alignment and + /// address space \p AddrSpace. + bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, + Align Alignment, unsigned AddrSpace) const; + // Return true if the target supports masked vector histograms. 
bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const; @@ -1934,6 +1940,10 @@ class TargetTransformInfo::Concept { virtual bool isLegalMaskedCompressStore(Type *DataType, Align Alignment) = 0; virtual bool isLegalMaskedExpandLoad(Type *DataType, Align Alignment) = 0; virtual bool isLegalStridedLoadStore(Type *DataType, Align Alignment) = 0; + virtual bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, + Align Alignment, + unsigned AddrSpace) = 0; + virtual bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) = 0; virtual bool isLegalAltInstr(VectorType *VecTy, unsigned Opcode0, unsigned Opcode1, @@ -2456,6 +2466,11 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { bool isLegalStridedLoadStore(Type *DataType, Align Alignment) override { return Impl.isLegalStridedLoadStore(DataType, Alignment); } + bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, + Align Alignment, + unsigned AddrSpace) override { + return Impl.isLegalInterleavedAccessType(VTy, Factor, Alignment, AddrSpace); + } bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) override { return Impl.isLegalMaskedVectorHistogram(AddrType, DataType); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 1c4fcb57755ec..dbdfb4d8cdfa3 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -332,6 +332,11 @@ class TargetTransformInfoImplBase { return false; } + bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, + Align Alignment, unsigned AddrSpace) { + return false; + } + bool isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const { return false; } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 8ab8a53b75311..1bad331467703 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp 
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -521,6 +521,13 @@ bool TargetTransformInfo::isLegalStridedLoadStore(Type *DataType, return TTIImpl->isLegalStridedLoadStore(DataType, Alignment); } +bool TargetTransformInfo::isLegalInterleavedAccessType( + VectorType *VTy, unsigned Factor, Align Alignment, + unsigned AddrSpace) const { + return TTIImpl->isLegalInterleavedAccessType(VTy, Factor, Alignment, + AddrSpace); +} + bool TargetTransformInfo::isLegalMaskedVectorHistogram(Type *AddrType, Type *DataType) const { return TTIImpl->isLegalMaskedVectorHistogram(AddrType, DataType); diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 3f50bd86b9b3b..13d28e4db49cd 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -295,6 +295,12 @@ class RISCVTTIImpl : public BasicTTIImplBase { return TLI->isLegalStridedLoadStore(DataTypeVT, Alignment); } + bool isLegalInterleavedAccessType(VectorType *VTy, unsigned Factor, + Align Alignment, unsigned AddrSpace) { + return TLI->isLegalInterleavedAccessType(VTy, Factor, Alignment, AddrSpace, + DL); + } + bool isLegalMaskedCompressStore(Type *DataTy, Align Alignment); bool isVScaleKnownToBeAPowerOfTwo() const { diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 5c164075e8325..1e8939988037d 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -2922,7 +2922,7 @@ class BoUpSLP { /// This is the recursive part of buildTree. 
void buildTree_rec(ArrayRef Roots, unsigned Depth, - const EdgeInfo &EI); + const EdgeInfo &EI, unsigned InterleaveFactor = 0); /// \returns true if the ExtractElement/ExtractValue instructions in \p VL can /// be vectorized to use the original vector (or aggregate "bitcast" to a @@ -3226,7 +3226,15 @@ class BoUpSLP { Instruction *MainOp = nullptr; Instruction *AltOp = nullptr; + /// Interleaving factor for interleaved loads Vectorize nodes. + unsigned InterleaveFactor = 0; + public: + /// Returns interleave factor for interleave nodes. + unsigned getInterleaveFactor() const { return InterleaveFactor; } + /// Sets interleaving factor for the interleaving nodes. + void setInterleave(unsigned Factor) { InterleaveFactor = Factor; } + /// Set this bundle's \p OpIdx'th operand to \p OpVL. void setOperand(unsigned OpIdx, ArrayRef OpVL) { if (Operands.size() < OpIdx + 1) @@ -3390,7 +3398,12 @@ class BoUpSLP { dbgs() << "State: "; switch (State) { case Vectorize: - dbgs() << "Vectorize\n"; + if (InterleaveFactor > 0) { + dbgs() << "Vectorize with interleave factor " << InterleaveFactor + << "\n"; + } else { + dbgs() << "Vectorize\n"; + } break; case ScatterVectorize: dbgs() << "ScatterVectorize\n"; @@ -3460,11 +3473,15 @@ class BoUpSLP { const InstructionsState &S, const EdgeInfo &UserTreeIdx, ArrayRef ReuseShuffleIndices = {}, - ArrayRef ReorderIndices = {}) { + ArrayRef ReorderIndices = {}, + unsigned InterleaveFactor = 0) { TreeEntry::EntryState EntryState = Bundle ? TreeEntry::Vectorize : TreeEntry::NeedToGather; - return newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, - ReuseShuffleIndices, ReorderIndices); + TreeEntry *E = newTreeEntry(VL, EntryState, Bundle, S, UserTreeIdx, + ReuseShuffleIndices, ReorderIndices); + if (E && InterleaveFactor > 0) + E->setInterleave(InterleaveFactor); + return E; } TreeEntry *newTreeEntry(ArrayRef VL, @@ -6932,11 +6949,16 @@ void BoUpSLP::tryToVectorizeGatheredLoads( // distance between scalar loads in these nodes. 
unsigned MaxVF = Slice.size(); unsigned UserMaxVF = 0; + unsigned InterleaveFactor = 0; if (MaxVF == 2) { UserMaxVF = MaxVF; } else { + // Found distance between segments of the interleaved loads. + std::optional InterleavedLoadsDistance = 0; + unsigned Order = 0; std::optional CommonVF = 0; DenseMap EntryToPosition; + SmallPtrSet DeinterleavedNodes; for (auto [Idx, V] : enumerate(Slice)) { for (const TreeEntry *E : ValueToGatherNodes.at(V)) { UserMaxVF = std::max(UserMaxVF, E->Scalars.size()); @@ -6951,12 +6973,60 @@ void BoUpSLP::tryToVectorizeGatheredLoads( if (*CommonVF != E->Scalars.size()) CommonVF.reset(); } + // Check if the load is the part of the interleaved load. + if (Pos != Idx && InterleavedLoadsDistance) { + if (!DeinterleavedNodes.contains(E) && + any_of(E->Scalars, [&, Slice = Slice](Value *V) { + if (isa(V)) + return false; + if (getTreeEntry(V)) + return true; + const auto &Nodes = ValueToGatherNodes.at(V); + return (Nodes.size() != 1 || !Nodes.contains(E)) && + !is_contained(Slice, V); + })) { + InterleavedLoadsDistance.reset(); + continue; + } + DeinterleavedNodes.insert(E); + if (*InterleavedLoadsDistance == 0) { + InterleavedLoadsDistance = Idx - Pos; + continue; + } + if ((Idx - Pos) % *InterleavedLoadsDistance != 0 || + (Idx - Pos) / *InterleavedLoadsDistance < Order) + InterleavedLoadsDistance.reset(); + Order = (Idx - Pos) / InterleavedLoadsDistance.value_or(1); + } + } + } + DeinterleavedNodes.clear(); + // Check if the large load represents interleaved load operation. + if (InterleavedLoadsDistance.value_or(0) > 1 && + CommonVF.value_or(0) != 0) { + InterleaveFactor = bit_ceil(*InterleavedLoadsDistance); + unsigned VF = *CommonVF; + OrdersType Order; + SmallVector PointerOps; + // Segmented load detected - vectorize at maximum vector factor. 
+ if (TTI->isLegalInterleavedAccessType( + getWidenedType(Slice.front()->getType(), VF), + InterleaveFactor, + cast(Slice.front())->getAlign(), + cast(Slice.front()) + ->getPointerAddressSpace()) && + canVectorizeLoads(Slice, Slice.front(), Order, + PointerOps) == LoadsState::Vectorize) { + UserMaxVF = InterleaveFactor * VF; + } else { + InterleaveFactor = 0; } } // Cannot represent the loads as consecutive vectorizable nodes - // just exit. unsigned ConsecutiveNodesSize = 0; if (!LoadEntriesToVectorize.empty() && + InterleaveFactor == 0 && any_of(zip(LoadEntriesToVectorize, LoadSetsToVectorize), [&, Slice = Slice](const auto &P) { const auto *It = find_if(Slice, [&](Value *V) { @@ -6976,7 +7046,8 @@ void BoUpSLP::tryToVectorizeGatheredLoads( continue; // Try to build long masked gather loads. UserMaxVF = bit_ceil(UserMaxVF); - if (any_of(seq(Slice.size() / UserMaxVF), + if (InterleaveFactor == 0 && + any_of(seq(Slice.size() / UserMaxVF), [&, Slice = Slice](unsigned Idx) { OrdersType Order; SmallVector PointerOps; @@ -7008,9 +7079,15 @@ void BoUpSLP::tryToVectorizeGatheredLoads( })) continue; unsigned Sz = VectorizableTree.size(); - buildTree_rec(SubSlice, 0, EdgeInfo()); + buildTree_rec(SubSlice, 0, EdgeInfo(), InterleaveFactor); if (Sz == VectorizableTree.size()) { IsVectorized = false; + // Try non-interleaved vectorization with smaller vector + // factor. + if (InterleaveFactor > 0) { + VF = 2 * (MaxVF / InterleaveFactor); + InterleaveFactor = 0; + } continue; } } @@ -7374,6 +7451,11 @@ BoUpSLP::TreeEntry::EntryState BoUpSLP::getScalarsVectorizationState( } return TreeEntry::ScatterVectorize; case LoadsState::StridedVectorize: + if (!IsGraphTransformMode && VectorizableTree.size() > 1) { + // Delay slow vectorized nodes for better vectorization attempts. 
+ LoadEntriesToVectorize.insert(VectorizableTree.size()); + return TreeEntry::NeedToGather; + } return TreeEntry::StridedVectorize; case LoadsState::Gather: #ifndef NDEBUG @@ -7707,7 +7789,8 @@ class PHIHandler { } // namespace void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, - const EdgeInfo &UserTreeIdx) { + const EdgeInfo &UserTreeIdx, + unsigned InterleaveFactor) { assert((allConstant(VL) || allSameType(VL)) && "Invalid types!"); SmallVector ReuseShuffleIndices; @@ -8185,7 +8268,7 @@ void BoUpSLP::buildTree_rec(ArrayRef VL, unsigned Depth, switch (State) { case TreeEntry::Vectorize: TE = newTreeEntry(VL, Bundle /*vectorized*/, S, UserTreeIdx, - ReuseShuffleIndices, CurrentOrder); + ReuseShuffleIndices, CurrentOrder, InterleaveFactor); if (CurrentOrder.empty()) LLVM_DEBUG(dbgs() << "SLP: added a vector of loads.\n"); else @@ -9895,6 +9978,12 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { Idx = EMask[Idx]; } CommonVF = E->Scalars.size(); + } else if (std::optional Factor = E->getInterleaveFactor(); + Factor && E->Scalars.size() != Mask.size() && + ShuffleVectorInst::isDeInterleaveMaskOfFactor(CommonMask, + *Factor)) { + // Deinterleaved nodes are free. 
+ std::iota(CommonMask.begin(), CommonMask.end(), 0); } ExtraCost += GetNodeMinBWAffectedCost(*E, CommonVF); V1 = Constant::getNullValue(getWidenedType(ScalarTy, CommonVF)); @@ -10968,10 +11057,15 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef VectorizedVals, auto *LI0 = cast(VL0); auto GetVectorCost = [&](InstructionCost CommonCost) { InstructionCost VecLdCost; - if (E->State == TreeEntry::Vectorize) { + if (E->State == TreeEntry::Vectorize && !E->getInterleaveFactor()) { VecLdCost = TTI->getMemoryOpCost( Instruction::Load, VecTy, LI0->getAlign(), LI0->getPointerAddressSpace(), CostKind, TTI::OperandValueInfo()); + } else if (std::optional Factor = E->getInterleaveFactor(); + E->State == TreeEntry::Vectorize && Factor.value_or(0) > 0) { + VecLdCost = TTI->getInterleavedMemoryOpCost( + Instruction::Load, VecTy, *Factor, std::nullopt, LI0->getAlign(), + LI0->getPointerAddressSpace(), CostKind); } else if (E->State == TreeEntry::StridedVectorize) { Align CommonAlignment = computeCommonAlignment(UniqueValues.getArrayRef()); @@ -11397,6 +11491,11 @@ bool BoUpSLP::isTreeTinyAndNotFullyVectorizable(bool ForReduction) const { })) return false; + if (VectorizableTree.back()->isGather() && + VectorizableTree.back()->isAltShuffle() && + VectorizableTree.back()->getVectorFactor() > 2) + return false; + assert(VectorizableTree.empty() ? 
ExternalUses.empty() : true && "We shouldn't have any external users"); diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index 443f17a9c09e7..8c6b92b65ae05 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -11,9 +11,6 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4 ; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1 -; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr i8, ptr [[PIX2]], i64 1 -; CHECK-NEXT: [[ARRAYIDX25:%.*]] = getelementptr i8, ptr [[PIX1]], i64 5 -; CHECK-NEXT: [[ARRAYIDX27:%.*]] = getelementptr i8, ptr [[PIX2]], i64 5 ; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3 ; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1 ; CHECK-NEXT: [[CONV33:%.*]] = zext i8 [[TMP10]] to i32 @@ -24,59 +21,42 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4 ; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1 -; CHECK-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2 -; CHECK-NEXT: [[TMP14:%.*]] = load i8, ptr [[ARRAYIDX32_1]], align 1 -; CHECK-NEXT: [[ARRAYIDX25_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 6 -; CHECK-NEXT: [[TMP4:%.*]] = load i8, ptr [[ARRAYIDX25_1]], align 1 -; CHECK-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 6 +; CHECK-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3 ; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX27_1]], align 1 -; CHECK-NEXT: [[TMP6:%.*]] = 
insertelement <2 x i8> poison, i8 [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i8> [[TMP6]], i8 [[TMP5]], i32 1 -; CHECK-NEXT: [[TMP8:%.*]] = extractelement <2 x i8> [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = zext i8 [[TMP8]] to i32 -; CHECK-NEXT: [[ARRAYIDX32_2:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3 -; CHECK-NEXT: [[ARRAYIDX34_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 3 -; CHECK-NEXT: [[TMP18:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX32_2]], i64 4, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP20:%.*]] = zext <2 x i8> [[TMP18]] to <2 x i16> -; CHECK-NEXT: [[TMP12:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX34_1]], i64 4, <2 x i1> , i32 2) -; CHECK-NEXT: [[TMP13:%.*]] = zext <2 x i8> [[TMP12]] to <2 x i16> -; CHECK-NEXT: [[TMP28:%.*]] = sub <2 x i16> [[TMP20]], [[TMP13]] -; CHECK-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP28]], i32 1 -; CHECK-NEXT: [[TMP16:%.*]] = sext i16 [[TMP15]] to i32 -; CHECK-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP28]], i32 0 -; CHECK-NEXT: [[CONV33_1:%.*]] = sext i16 [[TMP17]] to i32 +; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP5]] to i32 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1 -; CHECK-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 1 -; CHECK-NEXT: [[ARRAYIDX13_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 5 -; CHECK-NEXT: [[ARRAYIDX15_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 5 -; CHECK-NEXT: [[TMP19:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 
1 [[ADD_PTR_1]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 ; CHECK-NEXT: [[TMP29:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32> ; CHECK-NEXT: [[TMP84:%.*]] = zext i8 [[TMP29]] to i32 -; CHECK-NEXT: [[TMP22:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ADD_PTR64_1]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP31:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32> ; CHECK-NEXT: [[TMP23:%.*]] = sub <2 x i32> [[TMP21]], [[TMP31]] -; CHECK-NEXT: [[TMP49:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_2]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[TMP49:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32> -; CHECK-NEXT: [[TMP27:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_2]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP16:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32> ; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP26]], [[TMP50]] ; CHECK-NEXT: [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], ; CHECK-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]] -; CHECK-NEXT: [[TMP32:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_2]], i64 2, <2 x i1> , i32 2) -; 
CHECK-NEXT: [[TMP33:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32> ; CHECK-NEXT: [[TMP83:%.*]] = zext i8 [[TMP33]] to i32 -; CHECK-NEXT: [[TMP56:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_2]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32> ; CHECK-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP51]], [[TMP57]] -; CHECK-NEXT: [[TMP38:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13_2]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_2]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP40]] to <2 x i32> ; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP61]] ; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], @@ -86,44 +66,40 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP73:%.*]] = extractelement <2 x i32> [[TMP43]], i32 0 ; CHECK-NEXT: [[TMP34:%.*]] = extractelement <2 x i32> [[TMP43]], i32 1 ; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP34]], [[TMP73]] -; CHECK-NEXT: [[SUB51_2:%.*]] = sub i32 [[TMP73]], [[TMP34]] ; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0 ; CHECK-NEXT: [[TMP48:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1 ; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[TMP48]], [[TMP47]] -; CHECK-NEXT: [[TMP68:%.*]] = sub i32 
[[TMP47]], [[TMP48]] -; CHECK-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4 ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 -; CHECK-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1 -; CHECK-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr i8, ptr null, i64 1 -; CHECK-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5 -; CHECK-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP53:%.*]] = load <2 x i8>, ptr null, align 1 ; CHECK-NEXT: [[TMP52:%.*]] = load i8, ptr null, align 1 ; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32> ; CHECK-NEXT: [[TMP77:%.*]] = zext i8 [[TMP52]] to i32 -; CHECK-NEXT: [[TMP54:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP54:%.*]] = load <2 x i8>, ptr null, align 1 ; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32> ; CHECK-NEXT: [[TMP59:%.*]] = sub <2 x i32> [[TMP62]], [[TMP55]] -; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> , i32 2) ; CHECK-NEXT: [[TMP58:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32> -; CHECK-NEXT: [[TMP63:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <2 x i32> [[TMP58]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP63:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1 ; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32> -; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP58]], [[TMP76]] +; CHECK-NEXT: [[TMP45:%.*]] = sub <2 x i32> [[TMP60]], [[TMP76]] ; CHECK-NEXT: [[TMP46:%.*]] = shl <2 x i32> 
[[TMP45]], -; CHECK-NEXT: [[TMP60:%.*]] = add <2 x i32> [[TMP46]], [[TMP59]] -; CHECK-NEXT: [[TMP64:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP90:%.*]] = add <2 x i32> [[TMP46]], [[TMP59]] +; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2 +; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2 +; CHECK-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6 +; CHECK-NEXT: [[TMP64:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1 ; CHECK-NEXT: [[TMP79:%.*]] = zext <2 x i8> [[TMP64]] to <2 x i32> -; CHECK-NEXT: [[TMP82:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP82:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1 ; CHECK-NEXT: [[TMP91:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32> ; CHECK-NEXT: [[TMP65:%.*]] = sub <2 x i32> [[TMP79]], [[TMP91]] ; CHECK-NEXT: [[TMP75:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> , <2 x i8> poison) ; CHECK-NEXT: [[TMP98:%.*]] = zext <2 x i8> [[TMP75]] to <2 x i32> -; CHECK-NEXT: [[TMP100:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP100:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1 ; CHECK-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP100]] to <2 x i32> ; CHECK-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP98]], [[TMP103]] ; CHECK-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], -; CHECK-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP70]], [[TMP65]] -; CHECK-NEXT: [[TMP90:%.*]] = add <2 x i32> [[TMP72]], [[TMP60]] -; CHECK-NEXT: [[TMP74:%.*]] = sub <2 x i32> [[TMP60]], [[TMP72]] +; CHECK-NEXT: [[TMP74:%.*]] = add <2 x i32> [[TMP70]], [[TMP65]] ; CHECK-NEXT: [[TMP78:%.*]] = extractelement <2 x i32> [[TMP90]], i32 0 ; CHECK-NEXT: [[TMP71:%.*]] = 
extractelement <2 x i32> [[TMP90]], i32 1 ; CHECK-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP71]], [[TMP78]] @@ -132,46 +108,64 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP74]], i32 1 ; CHECK-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]] ; CHECK-NEXT: [[TMP107:%.*]] = sub i32 [[TMP80]], [[TMP81]] -; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]] -; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]] +; CHECK-NEXT: [[ADD48_4:%.*]] = add i32 [[ADD55_3]], [[ADD48_3]] +; CHECK-NEXT: [[TMP113:%.*]] = shufflevector <2 x i32> [[TMP43]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP122:%.*]] = insertelement <2 x i32> [[TMP113]], i32 [[ADD48_3]], i32 0 +; CHECK-NEXT: [[TMP72:%.*]] = insertelement <2 x i32> [[TMP43]], i32 [[ADD55_3]], i32 0 +; CHECK-NEXT: [[TMP123:%.*]] = sub <2 x i32> [[TMP122]], [[TMP72]] +; CHECK-NEXT: [[ADD55_4:%.*]] = add i32 [[TMP107]], [[SUB51_3]] +; CHECK-NEXT: [[TMP126:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP129:%.*]] = insertelement <2 x i32> [[TMP126]], i32 [[SUB51_3]], i32 0 +; CHECK-NEXT: [[TMP130:%.*]] = insertelement <2 x i32> [[TMP44]], i32 [[TMP107]], i32 0 +; CHECK-NEXT: [[TMP143:%.*]] = sub <2 x i32> [[TMP129]], [[TMP130]] +; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_4]], [[ADD48_2]] +; CHECK-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_4]] ; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[TMP77]], 15 ; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537 ; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535 ; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP34]], 15 ; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 ; CHECK-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535 -; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]] -; CHECK-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]] +; CHECK-NEXT: [[ADD94_5:%.*]] = 
add i32 [[ADD55_4]], [[ADD55_2]] +; CHECK-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_4]] ; CHECK-NEXT: [[SHR_I_2:%.*]] = lshr i32 [[TMP83]], 15 ; CHECK-NEXT: [[AND_I_2:%.*]] = and i32 [[SHR_I_2]], 65537 ; CHECK-NEXT: [[MUL_I_2:%.*]] = mul i32 [[AND_I_2]], 65535 ; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP84]], 15 ; CHECK-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537 ; CHECK-NEXT: [[ADD94_2:%.*]] = mul i32 [[AND_I50_1]], 65535 -; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[SUB51_3]], [[SUB51_2]] +; CHECK-NEXT: [[TMP144:%.*]] = extractelement <2 x i32> [[TMP123]], i32 0 +; CHECK-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP123]], i32 1 +; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[TMP144]], [[TMP145]] +; CHECK-NEXT: [[TMP169:%.*]] = sub i32 [[TMP145]], [[TMP144]] ; CHECK-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15 ; CHECK-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537 ; CHECK-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535 -; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP107]], [[TMP68]] -; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP68]], [[TMP107]] +; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0 +; CHECK-NEXT: [[TMP147:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1 +; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP146]], [[TMP147]] +; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP147]], [[TMP146]] ; CHECK-NEXT: [[SHR_I49_4:%.*]] = lshr i32 [[CONV1]], 15 ; CHECK-NEXT: [[AND_I50_4:%.*]] = and i32 [[SHR_I49_4]], 65537 ; CHECK-NEXT: [[MUL_I51_4:%.*]] = mul i32 [[AND_I50_4]], 65535 ; CHECK-NEXT: [[TMP66:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 ; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP66]] to <2 x i32> -; CHECK-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP148:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 +; CHECK-NEXT: [[TMP67:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> 
poison, <2 x i32> ; CHECK-NEXT: [[TMP85:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> -; CHECK-NEXT: [[TMP106:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[TMP1]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP149:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 +; CHECK-NEXT: [[TMP106:%.*]] = shufflevector <4 x i8> [[TMP149]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32> -; CHECK-NEXT: [[TMP109:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP150:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; CHECK-NEXT: [[TMP109:%.*]] = shufflevector <4 x i8> [[TMP150]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP89:%.*]] = zext <2 x i8> [[TMP109]] to <2 x i32> ; CHECK-NEXT: [[TMP87:%.*]] = sub <2 x i32> [[TMP108]], [[TMP89]] ; CHECK-NEXT: [[TMP88:%.*]] = shl <2 x i32> [[TMP87]], -; CHECK-NEXT: [[TMP112:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX22]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP112:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP120:%.*]] = zext <2 x i8> [[TMP112]] to <2 x i32> -; CHECK-NEXT: [[TMP94:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX25]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP94:%.*]] = shufflevector <4 x i8> [[TMP149]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32> -; CHECK-NEXT: [[TMP131:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX27]], i64 2, <2 x i1> , i32 2) +; CHECK-NEXT: [[TMP131:%.*]] = shufflevector <4 x i8> [[TMP150]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP132:%.*]] = zext <2 x i8> [[TMP131]] to <2 x i32> ; CHECK-NEXT: [[TMP95:%.*]] = sub <2 x i32> [[TMP128]], [[TMP132]] ; CHECK-NEXT: [[TMP96:%.*]] = shl <2 x i32> 
[[TMP95]], @@ -182,75 +176,70 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[TMP119:%.*]] = sub <2 x i32> [[TMP86]], [[TMP85]] ; CHECK-NEXT: [[TMP92:%.*]] = add <2 x i32> [[TMP88]], [[TMP119]] ; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP92]], <2 x i32> -; CHECK-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]] +; CHECK-NEXT: [[TMP101:%.*]] = add <2 x i32> [[TMP105]], [[TMP92]] +; CHECK-NEXT: [[TMP151:%.*]] = sub <2 x i32> [[TMP92]], [[TMP105]] ; CHECK-NEXT: [[TMP111:%.*]] = extractelement <2 x i32> [[TMP101]], i32 0 ; CHECK-NEXT: [[TMP99:%.*]] = extractelement <2 x i32> [[TMP101]], i32 1 ; CHECK-NEXT: [[ADD55:%.*]] = add i32 [[TMP99]], [[TMP111]] +; CHECK-NEXT: [[SUB51:%.*]] = sub i32 [[TMP111]], [[TMP99]] +; CHECK-NEXT: [[TMP153:%.*]] = extractelement <2 x i32> [[TMP151]], i32 0 +; CHECK-NEXT: [[TMP157:%.*]] = extractelement <2 x i32> [[TMP151]], i32 1 +; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[TMP157]], [[TMP153]] +; CHECK-NEXT: [[SUB59:%.*]] = sub i32 [[TMP153]], [[TMP157]] ; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP99]], 15 ; CHECK-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537 ; CHECK-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535 +; CHECK-NEXT: [[SHR_I59_4:%.*]] = lshr i32 [[TMP157]], 15 +; CHECK-NEXT: [[AND_I60_4:%.*]] = and i32 [[SHR_I59_4]], 65537 +; CHECK-NEXT: [[MUL_I61_4:%.*]] = mul i32 [[AND_I60_4]], 65535 ; CHECK-NEXT: [[TMP104:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 -; CHECK-NEXT: [[TMP113:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32> -; CHECK-NEXT: [[TMP114:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1 +; CHECK-NEXT: [[TMP110:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32> +; CHECK-NEXT: [[TMP158:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 +; CHECK-NEXT: [[TMP114:%.*]] = shufflevector <4 x i8> [[TMP158]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP133:%.*]] = zext <2 x i8> [[TMP114]] to <2 x i32> -; 
CHECK-NEXT: [[TMP116:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP121:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; CHECK-NEXT: [[TMP116:%.*]] = shufflevector <4 x i8> [[TMP121]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP115:%.*]] = zext <2 x i8> [[TMP116]] to <2 x i32> -; CHECK-NEXT: [[TMP118:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP159:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; CHECK-NEXT: [[TMP118:%.*]] = shufflevector <4 x i8> [[TMP159]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP134:%.*]] = zext <2 x i8> [[TMP118]] to <2 x i32> ; CHECK-NEXT: [[TMP124:%.*]] = sub <2 x i32> [[TMP115]], [[TMP134]] ; CHECK-NEXT: [[TMP125:%.*]] = shl <2 x i32> [[TMP124]], -; CHECK-NEXT: [[TMP122:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP123:%.*]] = insertelement <2 x i32> [[TMP122]], i32 [[CONV_1]], i32 0 -; CHECK-NEXT: [[TMP121:%.*]] = sub <2 x i32> [[TMP123]], [[TMP133]] -; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP125]], [[TMP121]] -; CHECK-NEXT: [[TMP126:%.*]] = shufflevector <2 x i8> [[TMP7]], <2 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP127:%.*]] = insertelement <2 x i8> [[TMP126]], i8 [[TMP14]], i32 1 +; CHECK-NEXT: [[TMP127:%.*]] = shufflevector <4 x i8> [[TMP158]], <4 x i8> poison, <2 x i32> ; CHECK-NEXT: [[TMP191:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32> -; CHECK-NEXT: [[TMP129:%.*]] = insertelement <2 x i32> [[TMP113]], i32 [[TMP9]], i32 0 -; CHECK-NEXT: [[TMP130:%.*]] = sub <2 x i32> [[TMP129]], [[TMP191]] -; CHECK-NEXT: [[TMP135:%.*]] = insertelement <2 x i32> [[TMP130]], i32 [[TMP16]], i32 1 +; CHECK-NEXT: [[TMP160:%.*]] = shufflevector <4 x i8> [[TMP121]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP161:%.*]] = zext <2 x i8> [[TMP160]] to <2 x i32> +; CHECK-NEXT: [[TMP171:%.*]] = shufflevector <4 x i8> [[TMP159]], <4 x i8> poison, <2 x i32> +; CHECK-NEXT: [[TMP172:%.*]] = zext <2 x i8> [[TMP171]] to <2 x i32> +; 
CHECK-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP161]], [[TMP172]] ; CHECK-NEXT: [[TMP136:%.*]] = shl <2 x i32> [[TMP135]], -; CHECK-NEXT: [[TMP110:%.*]] = shufflevector <2 x i32> [[TMP130]], <2 x i32> poison, <2 x i32> ; CHECK-NEXT: [[TMP137:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV33_1]], i32 1 -; CHECK-NEXT: [[TMP155:%.*]] = add <2 x i32> [[TMP136]], [[TMP137]] +; CHECK-NEXT: [[TMP173:%.*]] = sub <2 x i32> [[TMP137]], [[TMP191]] +; CHECK-NEXT: [[TMP174:%.*]] = add <2 x i32> [[TMP136]], [[TMP173]] +; CHECK-NEXT: [[TMP140:%.*]] = insertelement <2 x i32> [[TMP110]], i32 [[CONV_1]], i32 0 +; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP133]] +; CHECK-NEXT: [[TMP192:%.*]] = add <2 x i32> [[TMP125]], [[TMP141]] +; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP174]], [[TMP192]] +; CHECK-NEXT: [[TMP155:%.*]] = sub <2 x i32> [[TMP192]], [[TMP174]] ; CHECK-NEXT: [[TMP139:%.*]] = extractelement <2 x i32> [[TMP156]], i32 0 ; CHECK-NEXT: [[TMP142:%.*]] = extractelement <2 x i32> [[TMP156]], i32 1 +; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP142]], [[TMP139]] ; CHECK-NEXT: [[SUB45_1:%.*]] = sub i32 [[TMP139]], [[TMP142]] ; CHECK-NEXT: [[TMP138:%.*]] = extractelement <2 x i32> [[TMP155]], i32 0 -; CHECK-NEXT: [[TMP171:%.*]] = extractelement <2 x i32> [[TMP155]], i32 1 -; CHECK-NEXT: [[SUB47_1:%.*]] = sub i32 [[TMP138]], [[TMP171]] -; CHECK-NEXT: [[TMP140:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP105]], <2 x i32> -; CHECK-NEXT: [[TMP153:%.*]] = shufflevector <2 x i32> [[TMP156]], <2 x i32> [[TMP92]], <2 x i32> -; CHECK-NEXT: [[TMP192:%.*]] = add <2 x i32> [[TMP140]], [[TMP153]] -; CHECK-NEXT: [[TMP141:%.*]] = shufflevector <2 x i32> [[TMP105]], <2 x i32> [[TMP155]], <2 x i32> -; CHECK-NEXT: [[TMP193:%.*]] = shufflevector <2 x i32> [[TMP92]], <2 x i32> [[TMP155]], <2 x i32> -; CHECK-NEXT: [[TMP143:%.*]] = add <2 x i32> [[TMP141]], [[TMP193]] -; CHECK-NEXT: [[TMP144:%.*]] = extractelement <2 x i32> [[TMP192]], i32 1 -; CHECK-NEXT: 
[[TMP145:%.*]] = extractelement <2 x i32> [[TMP143]], i32 1 -; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP145]], [[TMP144]] -; CHECK-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP145]], 15 -; CHECK-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537 -; CHECK-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535 -; CHECK-NEXT: [[TMP146:%.*]] = extractelement <2 x i32> [[TMP192]], i32 0 -; CHECK-NEXT: [[TMP147:%.*]] = extractelement <2 x i32> [[TMP143]], i32 0 -; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP147]], [[TMP146]] -; CHECK-NEXT: [[TMP148:%.*]] = sub <2 x i32> [[TMP192]], [[TMP143]] -; CHECK-NEXT: [[ADD55_1:%.*]] = add i32 [[SUB47_1]], [[SUB45_1]] -; CHECK-NEXT: [[TMP149:%.*]] = shufflevector <2 x i32> [[TMP101]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP150:%.*]] = insertelement <2 x i32> [[TMP149]], i32 [[SUB45_1]], i32 0 -; CHECK-NEXT: [[TMP151:%.*]] = insertelement <2 x i32> [[TMP101]], i32 [[SUB47_1]], i32 0 -; CHECK-NEXT: [[TMP152:%.*]] = sub <2 x i32> [[TMP150]], [[TMP151]] -; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP147]], 15 +; CHECK-NEXT: [[SUB47_1:%.*]] = extractelement <2 x i32> [[TMP155]], i32 1 +; CHECK-NEXT: [[ADD94_1:%.*]] = add i32 [[SUB47_1]], [[TMP138]] +; CHECK-NEXT: [[SUB59_1:%.*]] = sub i32 [[TMP138]], [[SUB47_1]] +; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP142]], 15 ; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537 ; CHECK-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535 ; CHECK-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[SUB47_1]], 15 ; CHECK-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537 ; CHECK-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535 -; CHECK-NEXT: [[TMP194:%.*]] = lshr <2 x i32> [[TMP113]], +; CHECK-NEXT: [[TMP194:%.*]] = lshr <2 x i32> [[TMP110]], ; CHECK-NEXT: [[TMP154:%.*]] = and <2 x i32> [[TMP194]], ; CHECK-NEXT: [[TMP195:%.*]] = mul <2 x i32> [[TMP154]], -; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]] -; CHECK-NEXT: [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]] +; CHECK-NEXT: 
[[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD55]] +; CHECK-NEXT: [[SUB86:%.*]] = sub i32 [[ADD55]], [[ADD48_1]] ; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]] ; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]] ; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]] @@ -260,37 +249,32 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I_1]], [[ADD105]] ; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP34]] ; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]] -; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP147]] -; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]] -; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP145]] +; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP142]] +; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61_1]], [[SUB106]] +; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP99]] ; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]] ; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]] ; CHECK-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]] -; CHECK-NEXT: [[ADD78_1:%.*]] = add i32 [[ADD55_1]], [[ADD55]] -; CHECK-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD55_1]] ; CHECK-NEXT: [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]] ; CHECK-NEXT: [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]] -; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]] -; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]] -; CHECK-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_2]], [[ADD103_1]] +; CHECK-NEXT: [[ADD103_2:%.*]] = add i32 [[ADD94_5]], [[ADD103_1]] +; CHECK-NEXT: [[SUB104_2:%.*]] = sub i32 [[ADD103_1]], [[ADD94_5]] +; CHECK-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB104_1]] +; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB104_1]], [[SUB102_1]] +; CHECK-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_2]], [[ADD103_2]] ; CHECK-NEXT: [[XOR_I_1:%.*]] = 
xor i32 [[ADD_I_1]], [[TMP83]] ; CHECK-NEXT: [[ADD_I52_1:%.*]] = add i32 [[ADD94_2]], [[ADD105_1]] ; CHECK-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP84]] -; CHECK-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]] +; CHECK-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_2]] ; CHECK-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]] -; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]] -; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP99]] +; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_4]], [[SUB106_1]] +; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP157]] ; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]] ; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]] ; CHECK-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]] ; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]] -; CHECK-NEXT: [[TMP196:%.*]] = extractelement <2 x i32> [[TMP148]], i32 0 -; CHECK-NEXT: [[TMP157:%.*]] = extractelement <2 x i32> [[TMP148]], i32 1 -; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[TMP196]], [[TMP157]] -; CHECK-NEXT: [[TMP158:%.*]] = insertelement <2 x i32> [[TMP148]], i32 [[SUB51_2]], i32 0 -; CHECK-NEXT: [[TMP159:%.*]] = shufflevector <2 x i32> [[TMP148]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP160:%.*]] = insertelement <2 x i32> [[TMP159]], i32 [[SUB51_3]], i32 0 -; CHECK-NEXT: [[TMP161:%.*]] = sub <2 x i32> [[TMP158]], [[TMP160]] +; CHECK-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB45_1]], [[SUB51]] +; CHECK-NEXT: [[TMP170:%.*]] = sub i32 [[SUB51]], [[SUB45_1]] ; CHECK-NEXT: [[TMP162:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0 ; CHECK-NEXT: [[TMP163:%.*]] = shufflevector <2 x i32> [[TMP162]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP164:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0 @@ -298,29 +282,25 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr 
%add.pt ; CHECK-NEXT: [[TMP166:%.*]] = add <2 x i32> [[TMP163]], [[TMP165]] ; CHECK-NEXT: [[TMP167:%.*]] = sub <2 x i32> [[TMP163]], [[TMP165]] ; CHECK-NEXT: [[TMP168:%.*]] = shufflevector <2 x i32> [[TMP166]], <2 x i32> [[TMP167]], <2 x i32> -; CHECK-NEXT: [[TMP169:%.*]] = extractelement <2 x i32> [[TMP161]], i32 0 -; CHECK-NEXT: [[TMP170:%.*]] = extractelement <2 x i32> [[TMP161]], i32 1 ; CHECK-NEXT: [[ADD105_2:%.*]] = add i32 [[TMP169]], [[TMP170]] ; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[TMP170]], [[TMP169]] ; CHECK-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]] ; CHECK-NEXT: [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]] ; CHECK-NEXT: [[TMP197:%.*]] = add <2 x i32> [[TMP195]], [[TMP168]] -; CHECK-NEXT: [[TMP172:%.*]] = xor <2 x i32> [[TMP197]], [[TMP113]] -; CHECK-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP144]], 15 +; CHECK-NEXT: [[TMP152:%.*]] = xor <2 x i32> [[TMP197]], [[TMP110]] +; CHECK-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP111]], 15 ; CHECK-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537 ; CHECK-NEXT: [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535 ; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]] -; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP144]] +; CHECK-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP111]] ; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]] -; CHECK-NEXT: [[TMP173:%.*]] = extractelement <2 x i32> [[TMP172]], i32 0 -; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP173]] -; CHECK-NEXT: [[TMP174:%.*]] = extractelement <2 x i32> [[TMP172]], i32 1 -; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP174]] -; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]] ; CHECK-NEXT: [[TMP175:%.*]] = extractelement <2 x i32> [[TMP152]], i32 0 +; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP175]] ; CHECK-NEXT: [[TMP176:%.*]] = extractelement <2 x i32> [[TMP152]], i32 1 -; CHECK-NEXT: [[ADD78_3:%.*]] = add i32 
[[TMP175]], [[TMP176]] -; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[TMP176]], [[TMP175]] +; CHECK-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP176]] +; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]] +; CHECK-NEXT: [[ADD78_3:%.*]] = add i32 [[SUB59_1]], [[SUB59]] +; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB59_1]] ; CHECK-NEXT: [[TMP177:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0 ; CHECK-NEXT: [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP179:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0 @@ -358,9 +338,6 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4 ; THR15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4 ; THR15-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1 -; THR15-NEXT: [[ARRAYIDX10:%.*]] = getelementptr i8, ptr [[PIX2]], i64 1 -; THR15-NEXT: [[ARRAYIDX13:%.*]] = getelementptr i8, ptr [[PIX1]], i64 5 -; THR15-NEXT: [[ARRAYIDX15:%.*]] = getelementptr i8, ptr [[PIX2]], i64 5 ; THR15-NEXT: [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3 ; THR15-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1 ; THR15-NEXT: [[CONV33:%.*]] = zext i8 [[TMP1]] to i32 @@ -371,133 +348,116 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4 ; THR15-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4 ; THR15-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1 -; THR15-NEXT: [[ARRAYIDX22_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 2 -; THR15-NEXT: [[TMP3:%.*]] = load i8, ptr [[ARRAYIDX22_1]], align 1 -; THR15-NEXT: [[ARRAYIDX25_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 6 -; THR15-NEXT: [[TMP4:%.*]] = load i8, ptr 
[[ARRAYIDX25_1]], align 1 -; THR15-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 6 +; THR15-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3 ; THR15-NEXT: [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX27_1]], align 1 -; THR15-NEXT: [[TMP6:%.*]] = insertelement <2 x i8> poison, i8 [[TMP4]], i32 0 -; THR15-NEXT: [[TMP7:%.*]] = insertelement <2 x i8> [[TMP6]], i8 [[TMP5]], i32 1 -; THR15-NEXT: [[TMP8:%.*]] = extractelement <2 x i8> [[TMP7]], i32 0 -; THR15-NEXT: [[TMP9:%.*]] = zext i8 [[TMP8]] to i32 -; THR15-NEXT: [[ARRAYIDX32_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3 -; THR15-NEXT: [[ARRAYIDX34_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 3 -; THR15-NEXT: [[TMP10:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX32_1]], i64 4, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP11:%.*]] = zext <2 x i8> [[TMP10]] to <2 x i16> -; THR15-NEXT: [[TMP12:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX34_1]], i64 4, <2 x i1> , i32 2) -; THR15-NEXT: [[TMP13:%.*]] = zext <2 x i8> [[TMP12]] to <2 x i16> -; THR15-NEXT: [[TMP14:%.*]] = sub <2 x i16> [[TMP11]], [[TMP13]] -; THR15-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP14]], i32 1 -; THR15-NEXT: [[TMP16:%.*]] = sext i16 [[TMP15]] to i32 -; THR15-NEXT: [[SHL42_1:%.*]] = shl i32 [[TMP16]], 16 -; THR15-NEXT: [[TMP17:%.*]] = extractelement <2 x i16> [[TMP14]], i32 0 -; THR15-NEXT: [[TMP18:%.*]] = sext i16 [[TMP17]] to i32 -; THR15-NEXT: [[ADD43_1:%.*]] = add i32 [[SHL42_1]], [[TMP18]] +; THR15-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP5]] to i32 ; THR15-NEXT: [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; THR15-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] ; THR15-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; THR15-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 -; THR15-NEXT: 
[[TMP19:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1 +; THR15-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1 +; THR15-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; THR15-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 +; THR15-NEXT: [[TMP6:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 +; THR15-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> ; THR15-NEXT: [[TMP20:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32> -; THR15-NEXT: [[TMP21:%.*]] = load <2 x i8>, ptr [[ADD_PTR64_1]], align 1 +; THR15-NEXT: [[TMP87:%.*]] = zext i8 [[TMP6]] to i32 +; THR15-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; THR15-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> ; THR15-NEXT: [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32> ; THR15-NEXT: [[TMP23:%.*]] = sub <2 x i32> [[TMP20]], [[TMP22]] -; THR15-NEXT: [[TMP24:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; THR15-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; THR15-NEXT: [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> ; THR15-NEXT: [[TMP25:%.*]] = zext <2 x i8> [[TMP24]] to <2 x i32> -; THR15-NEXT: [[TMP26:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; THR15-NEXT: [[TMP16:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; THR15-NEXT: [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> ; THR15-NEXT: [[TMP27:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32> ; THR15-NEXT: [[TMP28:%.*]] = sub <2 x i32> [[TMP25]], [[TMP27]] ; THR15-NEXT: [[TMP29:%.*]] = shl <2 x i32> [[TMP28]], -; THR15-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP29]], [[TMP23]] -; THR15-NEXT: [[ARRAYIDX20_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 2 -; THR15-NEXT: [[ARRAYIDX22_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 2 -; THR15-NEXT: [[ARRAYIDX25_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 6 -; THR15-NEXT: 
[[ARRAYIDX27_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 6 -; THR15-NEXT: [[TMP31:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_2]], align 1 +; THR15-NEXT: [[TMP59:%.*]] = add <2 x i32> [[TMP29]], [[TMP23]] +; THR15-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> ; THR15-NEXT: [[TMP32:%.*]] = zext <2 x i8> [[TMP31]] to <2 x i32> -; THR15-NEXT: [[TMP33:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_2]], align 1 +; THR15-NEXT: [[TMP86:%.*]] = zext i8 [[TMP7]] to i32 +; THR15-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> ; THR15-NEXT: [[TMP34:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32> ; THR15-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP34]] -; THR15-NEXT: [[TMP36:%.*]] = load <2 x i8>, ptr [[ARRAYIDX25_2]], align 1 +; THR15-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> ; THR15-NEXT: [[TMP37:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32> -; THR15-NEXT: [[TMP38:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_2]], align 1 +; THR15-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> ; THR15-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32> ; THR15-NEXT: [[TMP40:%.*]] = sub <2 x i32> [[TMP37]], [[TMP39]] ; THR15-NEXT: [[TMP41:%.*]] = shl <2 x i32> [[TMP40]], -; THR15-NEXT: [[TMP42:%.*]] = add <2 x i32> [[TMP41]], [[TMP35]] +; THR15-NEXT: [[TMP76:%.*]] = add <2 x i32> [[TMP41]], [[TMP35]] +; THR15-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP76]], [[TMP59]] +; THR15-NEXT: [[TMP42:%.*]] = sub <2 x i32> [[TMP59]], [[TMP76]] ; THR15-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[TMP30]], i32 0 ; THR15-NEXT: [[TMP44:%.*]] = extractelement <2 x i32> [[TMP30]], i32 1 ; THR15-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP44]], [[TMP43]] -; THR15-NEXT: [[SUB45_2:%.*]] = sub i32 [[TMP43]], [[TMP44]] ; THR15-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP42]], i32 0 ; THR15-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP42]], i32 1 ; 
THR15-NEXT: [[ADD46_2:%.*]] = add i32 [[TMP46]], [[TMP45]] -; THR15-NEXT: [[SUB47_2:%.*]] = sub i32 [[TMP45]], [[TMP46]] -; THR15-NEXT: [[ADD48_2:%.*]] = add i32 [[ADD46_2]], [[ADD44_2]] -; THR15-NEXT: [[ADD55_2:%.*]] = add i32 [[SUB47_2]], [[SUB45_2]] -; THR15-NEXT: [[ARRAYIDX3_3:%.*]] = getelementptr i8, ptr null, i64 4 ; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 -; THR15-NEXT: [[ARRAYIDX8_3:%.*]] = getelementptr i8, ptr null, i64 1 -; THR15-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr i8, ptr null, i64 1 -; THR15-NEXT: [[ARRAYIDX15_3:%.*]] = getelementptr i8, ptr null, i64 5 -; THR15-NEXT: [[TMP47:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> , i32 2) +; THR15-NEXT: [[TMP47:%.*]] = load <2 x i8>, ptr null, align 1 ; THR15-NEXT: [[TMP48:%.*]] = load i8, ptr null, align 1 ; THR15-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP47]] to <2 x i32> ; THR15-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP48]] to i32 -; THR15-NEXT: [[TMP50:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 2, <2 x i1> , i32 2) +; THR15-NEXT: [[TMP50:%.*]] = load <2 x i8>, ptr null, align 1 ; THR15-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32> ; THR15-NEXT: [[TMP52:%.*]] = sub <2 x i32> [[TMP49]], [[TMP51]] -; THR15-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3_3]], i64 -4, <2 x i1> , i32 2) +; THR15-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> , i32 2) ; THR15-NEXT: [[TMP54:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32> -; THR15-NEXT: [[TMP55:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5_3]], i64 2, <2 x i1> , i32 2) +; THR15-NEXT: [[TMP77:%.*]] = shufflevector <2 x i32> [[TMP54]], <2 x i32> poison, <2 x i32> +; THR15-NEXT: [[TMP55:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1 ; THR15-NEXT: 
[[TMP56:%.*]] = zext <2 x i8> [[TMP55]] to <2 x i32> -; THR15-NEXT: [[TMP57:%.*]] = sub <2 x i32> [[TMP54]], [[TMP56]] +; THR15-NEXT: [[TMP57:%.*]] = sub <2 x i32> [[TMP77]], [[TMP56]] ; THR15-NEXT: [[TMP58:%.*]] = shl <2 x i32> [[TMP57]], -; THR15-NEXT: [[TMP59:%.*]] = add <2 x i32> [[TMP58]], [[TMP52]] -; THR15-NEXT: [[TMP60:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX8_3]], i64 2, <2 x i1> , i32 2) +; THR15-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP58]], [[TMP52]] +; THR15-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2 +; THR15-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2 +; THR15-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6 +; THR15-NEXT: [[TMP60:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1 ; THR15-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP60]] to <2 x i32> -; THR15-NEXT: [[TMP62:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10_3]], i64 2, <2 x i1> , i32 2) +; THR15-NEXT: [[TMP62:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1 ; THR15-NEXT: [[TMP63:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32> ; THR15-NEXT: [[TMP64:%.*]] = sub <2 x i32> [[TMP61]], [[TMP63]] ; THR15-NEXT: [[TMP65:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> , <2 x i8> poison) ; THR15-NEXT: [[TMP66:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32> -; THR15-NEXT: [[TMP67:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15_3]], i64 2, <2 x i1> , i32 2) +; THR15-NEXT: [[TMP67:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1 ; THR15-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> ; THR15-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]] ; THR15-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], -; THR15-NEXT: [[TMP71:%.*]] = add <2 x i32> [[TMP70]], [[TMP64]] -; THR15-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP71]], [[TMP59]] -; THR15-NEXT: 
[[TMP73:%.*]] = sub <2 x i32> [[TMP59]], [[TMP71]] +; THR15-NEXT: [[TMP73:%.*]] = add <2 x i32> [[TMP70]], [[TMP64]] ; THR15-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0 ; THR15-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1 ; THR15-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP75]], [[TMP74]] -; THR15-NEXT: [[TMP76:%.*]] = insertelement <2 x i32> [[TMP72]], i32 [[ADD44_2]], i32 1 -; THR15-NEXT: [[TMP77:%.*]] = shufflevector <2 x i32> [[TMP72]], <2 x i32> poison, <2 x i32> -; THR15-NEXT: [[TMP78:%.*]] = insertelement <2 x i32> [[TMP77]], i32 [[ADD46_2]], i32 1 -; THR15-NEXT: [[TMP79:%.*]] = sub <2 x i32> [[TMP76]], [[TMP78]] +; THR15-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP74]], [[TMP75]] ; THR15-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP73]], i32 0 ; THR15-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP73]], i32 1 ; THR15-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]] -; THR15-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> [[TMP73]], i32 [[SUB45_2]], i32 1 -; THR15-NEXT: [[TMP83:%.*]] = shufflevector <2 x i32> [[TMP73]], <2 x i32> poison, <2 x i32> -; THR15-NEXT: [[TMP84:%.*]] = insertelement <2 x i32> [[TMP83]], i32 [[SUB47_2]], i32 1 +; THR15-NEXT: [[SUB47_3:%.*]] = sub i32 [[TMP80]], [[TMP81]] +; THR15-NEXT: [[ADD48_4:%.*]] = add i32 [[ADD55_3]], [[ADD48_3]] +; THR15-NEXT: [[TMP78:%.*]] = shufflevector <2 x i32> [[TMP30]], <2 x i32> poison, <2 x i32> +; THR15-NEXT: [[TMP71:%.*]] = insertelement <2 x i32> [[TMP78]], i32 [[ADD48_3]], i32 0 +; THR15-NEXT: [[TMP83:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[ADD55_3]], i32 0 +; THR15-NEXT: [[TMP79:%.*]] = sub <2 x i32> [[TMP71]], [[TMP83]] +; THR15-NEXT: [[ADD55_4:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]] +; THR15-NEXT: [[TMP137:%.*]] = shufflevector <2 x i32> [[TMP42]], <2 x i32> poison, <2 x i32> +; THR15-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> [[TMP137]], i32 [[SUB45_3]], i32 0 +; THR15-NEXT: [[TMP84:%.*]] = insertelement <2 x i32> [[TMP42]], i32 [[SUB47_3]], 
i32 0 ; THR15-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP82]], [[TMP84]] -; THR15-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_3]], [[ADD48_2]] -; THR15-NEXT: [[SUB102:%.*]] = sub i32 [[ADD48_2]], [[ADD48_3]] +; THR15-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_4]], [[ADD44_2]] +; THR15-NEXT: [[SUB102:%.*]] = sub i32 [[ADD44_2]], [[ADD48_4]] ; THR15-NEXT: [[SHR_I:%.*]] = lshr i32 [[CONV_3]], 15 ; THR15-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 ; THR15-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535 -; THR15-NEXT: [[SHR_I49:%.*]] = lshr i32 [[ADD46_2]], 15 +; THR15-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP44]], 15 ; THR15-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537 ; THR15-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535 -; THR15-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_3]], [[ADD55_2]] -; THR15-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD55_2]], [[ADD55_3]] -; THR15-NEXT: [[TMP86:%.*]] = extractelement <2 x i32> [[TMP20]], i32 1 +; THR15-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_4]], [[ADD46_2]] +; THR15-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD46_2]], [[ADD55_4]] ; THR15-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP86]], 15 ; THR15-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 ; THR15-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535 -; THR15-NEXT: [[TMP87:%.*]] = extractelement <2 x i32> [[TMP20]], i32 0 ; THR15-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP87]], 15 ; THR15-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537 ; THR15-NEXT: [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535 @@ -517,19 +477,22 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535 ; THR15-NEXT: [[TMP92:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 ; THR15-NEXT: [[TMP93:%.*]] = zext <2 x i8> [[TMP92]] to <2 x i32> -; THR15-NEXT: [[TMP94:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[PIX2]], i64 2, <2 x i1> , i32 2) +; THR15-NEXT: [[TMP143:%.*]] = load <4 x i8>, ptr [[PIX2]], 
align 1 +; THR15-NEXT: [[TMP94:%.*]] = shufflevector <4 x i8> [[TMP143]], <4 x i8> poison, <2 x i32> ; THR15-NEXT: [[TMP95:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32> -; THR15-NEXT: [[TMP96:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX3]], i64 2, <2 x i1> , i32 2) +; THR15-NEXT: [[TMP146:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 +; THR15-NEXT: [[TMP96:%.*]] = shufflevector <4 x i8> [[TMP146]], <4 x i8> poison, <2 x i32> ; THR15-NEXT: [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32> -; THR15-NEXT: [[TMP98:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX5]], i64 2, <2 x i1> , i32 2) +; THR15-NEXT: [[TMP147:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 +; THR15-NEXT: [[TMP98:%.*]] = shufflevector <4 x i8> [[TMP147]], <4 x i8> poison, <2 x i32> ; THR15-NEXT: [[TMP99:%.*]] = zext <2 x i8> [[TMP98]] to <2 x i32> ; THR15-NEXT: [[TMP100:%.*]] = sub <2 x i32> [[TMP97]], [[TMP99]] ; THR15-NEXT: [[TMP101:%.*]] = shl <2 x i32> [[TMP100]], -; THR15-NEXT: [[TMP102:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX10]], i64 2, <2 x i1> , i32 2) +; THR15-NEXT: [[TMP102:%.*]] = shufflevector <4 x i8> [[TMP143]], <4 x i8> poison, <2 x i32> ; THR15-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP102]] to <2 x i32> -; THR15-NEXT: [[TMP104:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX13]], i64 2, <2 x i1> , i32 2) +; THR15-NEXT: [[TMP104:%.*]] = shufflevector <4 x i8> [[TMP146]], <4 x i8> poison, <2 x i32> ; THR15-NEXT: [[TMP105:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32> -; THR15-NEXT: [[TMP106:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 [[ARRAYIDX15]], i64 2, <2 x i1> , i32 2) +; THR15-NEXT: [[TMP106:%.*]] = shufflevector <4 x i8> [[TMP147]], <4 x i8> poison, <2 x i32> ; THR15-NEXT: [[TMP107:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32> ; THR15-NEXT: 
[[TMP108:%.*]] = sub <2 x i32> [[TMP105]], [[TMP107]] ; THR15-NEXT: [[TMP109:%.*]] = shl <2 x i32> [[TMP108]], @@ -549,6 +512,7 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[TMP121:%.*]] = extractelement <2 x i32> [[TMP118]], i32 0 ; THR15-NEXT: [[TMP122:%.*]] = extractelement <2 x i32> [[TMP118]], i32 1 ; THR15-NEXT: [[ADD55:%.*]] = add i32 [[TMP122]], [[TMP121]] +; THR15-NEXT: [[SUB59:%.*]] = sub i32 [[TMP121]], [[TMP122]] ; THR15-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP120]], 15 ; THR15-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537 ; THR15-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535 @@ -557,55 +521,52 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535 ; THR15-NEXT: [[TMP123:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 ; THR15-NEXT: [[TMP124:%.*]] = zext <2 x i8> [[TMP123]] to <2 x i32> -; THR15-NEXT: [[TMP125:%.*]] = load <2 x i8>, ptr [[ADD_PTR644]], align 1 +; THR15-NEXT: [[TMP148:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 +; THR15-NEXT: [[TMP125:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> ; THR15-NEXT: [[TMP126:%.*]] = zext <2 x i8> [[TMP125]] to <2 x i32> -; THR15-NEXT: [[TMP127:%.*]] = load <2 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; THR15-NEXT: [[TMP152:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 +; THR15-NEXT: [[TMP127:%.*]] = shufflevector <4 x i8> [[TMP152]], <4 x i8> poison, <2 x i32> ; THR15-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32> -; THR15-NEXT: [[TMP129:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; THR15-NEXT: [[TMP153:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 +; THR15-NEXT: [[TMP129:%.*]] = shufflevector <4 x i8> [[TMP153]], <4 x i8> poison, <2 x i32> ; THR15-NEXT: [[TMP130:%.*]] = zext <2 x i8> [[TMP129]] to <2 x i32> ; THR15-NEXT: [[TMP131:%.*]] = sub <2 x i32> [[TMP128]], [[TMP130]] ; THR15-NEXT: 
[[TMP132:%.*]] = shl <2 x i32> [[TMP131]], -; THR15-NEXT: [[TMP133:%.*]] = shufflevector <2 x i32> [[TMP124]], <2 x i32> poison, <2 x i32> -; THR15-NEXT: [[TMP134:%.*]] = insertelement <2 x i32> [[TMP133]], i32 [[CONV_1]], i32 0 -; THR15-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP134]], [[TMP126]] -; THR15-NEXT: [[TMP136:%.*]] = add <2 x i32> [[TMP132]], [[TMP135]] -; THR15-NEXT: [[TMP137:%.*]] = shufflevector <2 x i8> [[TMP7]], <2 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP138:%.*]] = insertelement <2 x i8> [[TMP137]], i8 [[TMP3]], i32 1 +; THR15-NEXT: [[TMP138:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> ; THR15-NEXT: [[TMP139:%.*]] = zext <2 x i8> [[TMP138]] to <2 x i32> -; THR15-NEXT: [[TMP140:%.*]] = insertelement <2 x i32> [[TMP124]], i32 [[TMP9]], i32 0 +; THR15-NEXT: [[TMP154:%.*]] = shufflevector <4 x i8> [[TMP152]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP155:%.*]] = zext <2 x i8> [[TMP154]] to <2 x i32> +; THR15-NEXT: [[TMP133:%.*]] = shufflevector <4 x i8> [[TMP153]], <4 x i8> poison, <2 x i32> +; THR15-NEXT: [[TMP134:%.*]] = zext <2 x i8> [[TMP133]] to <2 x i32> +; THR15-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP155]], [[TMP134]] +; THR15-NEXT: [[TMP170:%.*]] = shl <2 x i32> [[TMP135]], +; THR15-NEXT: [[TMP140:%.*]] = insertelement <2 x i32> [[TMP124]], i32 [[CONV33_1]], i32 1 ; THR15-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP139]] -; THR15-NEXT: [[TMP142:%.*]] = extractelement <2 x i32> [[TMP141]], i32 0 -; THR15-NEXT: [[SHL30_1:%.*]] = shl i32 [[TMP142]], 16 -; THR15-NEXT: [[TMP143:%.*]] = extractelement <2 x i32> [[TMP141]], i32 1 -; THR15-NEXT: [[ADD31_1:%.*]] = add i32 [[SHL30_1]], [[TMP143]] +; THR15-NEXT: [[TMP171:%.*]] = add <2 x i32> [[TMP170]], [[TMP141]] +; THR15-NEXT: [[TMP186:%.*]] = insertelement <2 x i32> [[TMP124]], i32 [[CONV_1]], i32 0 +; THR15-NEXT: [[TMP187:%.*]] = sub <2 x i32> [[TMP186]], [[TMP126]] +; THR15-NEXT: [[TMP142:%.*]] = add <2 x i32> [[TMP132]], [[TMP187]] +; THR15-NEXT: 
[[TMP136:%.*]] = add <2 x i32> [[TMP171]], [[TMP142]] +; THR15-NEXT: [[TMP149:%.*]] = sub <2 x i32> [[TMP142]], [[TMP171]] ; THR15-NEXT: [[TMP144:%.*]] = extractelement <2 x i32> [[TMP136]], i32 0 ; THR15-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP136]], i32 1 +; THR15-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP145]], [[TMP144]] ; THR15-NEXT: [[SUB45_1:%.*]] = sub i32 [[TMP144]], [[TMP145]] -; THR15-NEXT: [[TMP146:%.*]] = shufflevector <2 x i32> [[TMP136]], <2 x i32> poison, <2 x i32> -; THR15-NEXT: [[TMP147:%.*]] = insertelement <2 x i32> [[TMP146]], i32 [[ADD43_1]], i32 1 -; THR15-NEXT: [[TMP148:%.*]] = insertelement <2 x i32> [[TMP136]], i32 [[ADD31_1]], i32 1 -; THR15-NEXT: [[TMP149:%.*]] = add <2 x i32> [[TMP147]], [[TMP148]] -; THR15-NEXT: [[SUB47_1:%.*]] = sub i32 [[ADD31_1]], [[ADD43_1]] ; THR15-NEXT: [[TMP150:%.*]] = extractelement <2 x i32> [[TMP149]], i32 0 ; THR15-NEXT: [[TMP151:%.*]] = extractelement <2 x i32> [[TMP149]], i32 1 ; THR15-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP151]], [[TMP150]] ; THR15-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP150]], [[TMP151]] -; THR15-NEXT: [[ADD55_1:%.*]] = add i32 [[SUB47_1]], [[SUB45_1]] -; THR15-NEXT: [[TMP152:%.*]] = shufflevector <2 x i32> [[TMP118]], <2 x i32> poison, <2 x i32> -; THR15-NEXT: [[TMP153:%.*]] = insertelement <2 x i32> [[TMP152]], i32 [[SUB45_1]], i32 0 -; THR15-NEXT: [[TMP154:%.*]] = insertelement <2 x i32> [[TMP118]], i32 [[SUB47_1]], i32 0 -; THR15-NEXT: [[TMP155:%.*]] = sub <2 x i32> [[TMP153]], [[TMP154]] -; THR15-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP151]], 15 +; THR15-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP145]], 15 ; THR15-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537 ; THR15-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535 -; THR15-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[SUB47_1]], 15 +; THR15-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[TMP151]], 15 ; THR15-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537 ; THR15-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535 ; THR15-NEXT: 
[[TMP156:%.*]] = lshr <2 x i32> [[TMP124]], ; THR15-NEXT: [[TMP157:%.*]] = and <2 x i32> [[TMP156]], ; THR15-NEXT: [[TMP158:%.*]] = mul <2 x i32> [[TMP157]], -; THR15-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]] -; THR15-NEXT: [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]] +; THR15-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_2]], [[ADD48]] +; THR15-NEXT: [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_2]] ; THR15-NEXT: [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]] ; THR15-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]] ; THR15-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]] @@ -613,16 +574,16 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]] ; THR15-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[CONV_3]] ; THR15-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]] -; THR15-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[ADD46_2]] +; THR15-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP44]] ; THR15-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]] -; THR15-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP151]] +; THR15-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP145]] ; THR15-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]] ; THR15-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP120]] ; THR15-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]] ; THR15-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]] ; THR15-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]] -; THR15-NEXT: [[ADD78_1:%.*]] = add i32 [[ADD55_1]], [[ADD55]] -; THR15-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD55_1]] +; THR15-NEXT: [[ADD78_1:%.*]] = add i32 [[ADD48_1]], [[ADD55]] +; THR15-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD48_1]] ; THR15-NEXT: [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]] ; THR15-NEXT: [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]] ; THR15-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]] @@ -632,15 +593,15 @@ define 
i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]] ; THR15-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP87]] ; THR15-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]] -; THR15-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[SUB47_1]] +; THR15-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP151]] ; THR15-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]] ; THR15-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP122]] ; THR15-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]] ; THR15-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]] ; THR15-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]] ; THR15-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]] -; THR15-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB51_1]], [[SUB51]] -; THR15-NEXT: [[SUB86_2:%.*]] = sub i32 [[SUB51]], [[SUB51_1]] +; THR15-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB45_1]], [[SUB51]] +; THR15-NEXT: [[SUB86_2:%.*]] = sub i32 [[SUB51]], [[SUB45_1]] ; THR15-NEXT: [[TMP159:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0 ; THR15-NEXT: [[TMP160:%.*]] = shufflevector <2 x i32> [[TMP159]], <2 x i32> poison, <2 x i32> zeroinitializer ; THR15-NEXT: [[TMP161:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0 @@ -665,10 +626,8 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; THR15-NEXT: [[TMP169:%.*]] = extractelement <2 x i32> [[TMP167]], i32 1 ; THR15-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP169]] ; THR15-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]] -; THR15-NEXT: [[TMP170:%.*]] = extractelement <2 x i32> [[TMP155]], i32 0 -; THR15-NEXT: [[TMP171:%.*]] = extractelement <2 x i32> [[TMP155]], i32 1 -; THR15-NEXT: [[ADD78_3:%.*]] = add i32 [[TMP170]], [[TMP171]] -; THR15-NEXT: [[SUB86_3:%.*]] = sub i32 [[TMP171]], [[TMP170]] +; THR15-NEXT: [[ADD78_3:%.*]] = add i32 
[[SUB51_1]], [[SUB59]] +; THR15-NEXT: [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB51_1]] ; THR15-NEXT: [[TMP172:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0 ; THR15-NEXT: [[TMP173:%.*]] = shufflevector <2 x i32> [[TMP172]], <2 x i32> poison, <2 x i32> zeroinitializer ; THR15-NEXT: [[TMP174:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0 diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll index 54eb564768318..ce26bd3b89392 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/segmented-loads.ll @@ -6,8 +6,9 @@ define void @test() { ; CHECK-LABEL: @test( -; CHECK-NEXT: [[TMP1:%.*]] = call <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i64(ptr align 8 @src, i64 16, <4 x i1> , i32 4) -; CHECK-NEXT: [[TMP2:%.*]] = call <4 x double> @llvm.experimental.vp.strided.load.v4f64.p0.i64(ptr align 8 getelementptr inbounds ([8 x double], ptr @src, i32 0, i64 1), i64 16, <4 x i1> , i32 4) +; CHECK-NEXT: [[TMP4:%.*]] = load <8 x double>, ptr @src, align 8 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x double> [[TMP4]], <8 x double> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = fsub fast <4 x double> [[TMP1]], [[TMP2]] ; CHECK-NEXT: store <4 x double> [[TMP3]], ptr @dst, align 8 ; CHECK-NEXT: ret void