diff --git a/llvm/include/llvm/Analysis/VectorUtils.h b/llvm/include/llvm/Analysis/VectorUtils.h index f21594c557e0e..4390b45f1f730 100644 --- a/llvm/include/llvm/Analysis/VectorUtils.h +++ b/llvm/include/llvm/Analysis/VectorUtils.h @@ -203,6 +203,15 @@ bool getShuffleDemandedElts(int SrcWidth, ArrayRef Mask, const APInt &DemandedElts, APInt &DemandedLHS, APInt &DemandedRHS, bool AllowUndefElts = false); +/// Does this shuffle mask represent either one slide shuffle or a pair of +/// two slide shuffles, combined with a select on some constant vector mask? +/// A slide is a shuffle mask which shifts some set of elements up or down +/// the vector, with all other elements being undefined. An identity shuffle +/// will be matched a slide by 0. The output parameter provides the source +/// (-1 means no source), and slide direction for each slide. +bool isMaskedSlidePair(ArrayRef Mask, int NumElts, + std::array, 2> &SrcInfo); + /// Replace each shuffle mask index with the scaled sequential indices for an /// equivalent mask of narrowed elements. Mask elements that are less than 0 /// (sentinel values) are repeated in the output mask. diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index 53be7fc0bee9f..52102d8c74c28 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -413,6 +413,36 @@ bool llvm::getShuffleDemandedElts(int SrcWidth, ArrayRef Mask, return true; } +bool llvm::isMaskedSlidePair(ArrayRef Mask, int NumElts, + std::array, 2> &SrcInfo) { + const int SignalValue = NumElts * 2; + SrcInfo[0] = {-1, SignalValue}; + SrcInfo[1] = {-1, SignalValue}; + for (auto [i, M] : enumerate(Mask)) { + if (M < 0) + continue; + int Src = M >= (int)NumElts; + int Diff = (int)i - (M % NumElts); + bool Match = false; + for (int j = 0; j < 2; j++) { + auto &[SrcE, DiffE] = SrcInfo[j]; + if (SrcE == -1) { + assert(DiffE == SignalValue); + SrcE = Src; + DiffE = Diff; + } + if (SrcE == Src && DiffE == Diff) { + Match = true; + break; + } + } + if (!Match) + return false; + } + assert(SrcInfo[0].first != -1 && "Must find one slide"); + return true; +} + void llvm::narrowShuffleMaskElts(int Scale, ArrayRef Mask, SmallVectorImpl &ScaledMask) { assert(Scale > 0 && "Unexpected scaling factor"); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 6076fe56416ad..71fd3ab28b273 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -4562,32 +4562,9 @@ static bool isInterleaveShuffle(ArrayRef Mask, MVT VT, int &EvenSrc, /// Is this mask representing a masked combination of two slides? static bool isMaskedSlidePair(ArrayRef Mask, - std::pair SrcInfo[2]) { - int NumElts = Mask.size(); - int SignalValue = NumElts * 2; - SrcInfo[0] = {-1, SignalValue}; - SrcInfo[1] = {-1, SignalValue}; - for (unsigned i = 0; i != Mask.size(); ++i) { - int M = Mask[i]; - if (M < 0) - continue; - int Src = M >= (int)NumElts; - int Diff = (int)i - (M % NumElts); - bool Match = false; - for (int j = 0; j < 2; j++) { - if (SrcInfo[j].first == -1) { - assert(SrcInfo[j].second == SignalValue); - SrcInfo[j].first = Src; - SrcInfo[j].second = Diff; - } - if (SrcInfo[j].first == Src && SrcInfo[j].second == Diff) { - Match = true; - break; - } - } - if (!Match) - return false; - } + std::array, 2> &SrcInfo) { + if (!llvm::isMaskedSlidePair(Mask, Mask.size(), SrcInfo)) + return false; // Avoid matching vselect idioms if (SrcInfo[0].second == 0 && SrcInfo[1].second == 0) @@ -4603,7 +4580,8 @@ static bool isMaskedSlidePair(ArrayRef Mask, // Exactly matches the semantics of a previously existing custom matcher // to allow migration to new matcher without changing output. -static bool isElementRotate(std::pair SrcInfo[2], unsigned NumElts) { +static bool isElementRotate(std::array, 2> &SrcInfo, + unsigned NumElts) { if (SrcInfo[1].first == -1) return true; return SrcInfo[0].second < 0 && SrcInfo[1].second > 0 && @@ -5604,10 +5582,10 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, // without masking. Avoid matching bit rotates (which are not also element // rotates) as slide pairs. This is a performance heuristic, not a // functional check. - std::pair SrcInfo[2]; + std::array, 2> SrcInfo; unsigned RotateAmt; MVT RotateVT; - if (isMaskedSlidePair(Mask, SrcInfo) && + if (::isMaskedSlidePair(Mask, SrcInfo) && (isElementRotate(SrcInfo, NumElts) || !isLegalBitRotate(Mask, VT, Subtarget, RotateVT, RotateAmt))) { SDValue Sources[2]; @@ -5964,10 +5942,11 @@ bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { if (SVT.getScalarType() == MVT::i1) return false; - std::pair SrcInfo[2]; + std::array, 2> SrcInfo; int Dummy1, Dummy2; return ShuffleVectorInst::isReverseMask(M, NumElts) || - (isMaskedSlidePair(M, SrcInfo) && isElementRotate(SrcInfo, NumElts)) || + (::isMaskedSlidePair(M, SrcInfo) && + isElementRotate(SrcInfo, NumElts)) || isInterleaveShuffle(M, SVT, Dummy1, Dummy2, Subtarget); } diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index dfea25e11c0b6..2c4b92ee9e5cd 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -474,6 +474,64 @@ costShuffleViaVRegSplitting(RISCVTTIImpl &TTI, MVT LegalVT, return InstructionCost::getInvalid(); } +InstructionCost RISCVTTIImpl::getSlideCost(FixedVectorType *Tp, + ArrayRef Mask, + TTI::TargetCostKind CostKind) { + // Avoid missing masks and length changing shuffles + if (Mask.size() <= 2 || Mask.size() != Tp->getNumElements()) + return InstructionCost::getInvalid(); + + int NumElts = Tp->getNumElements(); + std::pair LT = getTypeLegalizationCost(Tp); + // Avoid scalarization cases + if (!LT.second.isFixedLengthVector()) + return InstructionCost::getInvalid(); + + // Requires moving elements between parts, which requires additional + // unmodeled instructions. + if (LT.first != 1) + return InstructionCost::getInvalid(); + + auto GetSlideOpcode = [&](int SlideAmt) { + assert(SlideAmt != 0); + bool IsVI = isUInt<5>(std::abs(SlideAmt)); + if (SlideAmt < 0) + return IsVI ? RISCV::VSLIDEDOWN_VI : RISCV::VSLIDEDOWN_VX; + return IsVI ? RISCV::VSLIDEUP_VI : RISCV::VSLIDEUP_VX; + }; + + std::array, 2> SrcInfo; + if (!isMaskedSlidePair(Mask, NumElts, SrcInfo)) + return InstructionCost::getInvalid(); + + if (SrcInfo[1].second == 0) + std::swap(SrcInfo[0], SrcInfo[1]); + + InstructionCost FirstSlideCost = 0; + if (SrcInfo[0].second != 0) { + unsigned Opcode = GetSlideOpcode(SrcInfo[0].second); + FirstSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind); + } + + if (SrcInfo[1].first == -1) + return FirstSlideCost; + + InstructionCost SecondSlideCost = 0; + if (SrcInfo[1].second != 0) { + unsigned Opcode = GetSlideOpcode(SrcInfo[1].second); + SecondSlideCost = getRISCVInstructionCost(Opcode, LT.second, CostKind); + } else { + SecondSlideCost = + getRISCVInstructionCost(RISCV::VMERGE_VVM, LT.second, CostKind); + } + + auto EC = Tp->getElementCount(); + VectorType *MaskTy = + VectorType::get(IntegerType::getInt1Ty(Tp->getContext()), EC); + InstructionCost MaskCost = getConstantPoolLoadCost(MaskTy, CostKind); + return FirstSlideCost + SecondSlideCost + MaskCost; +} + InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, ArrayRef Mask, TTI::TargetCostKind CostKind, @@ -487,8 +545,8 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, // First, handle cases where having a fixed length vector enables us to // give a more accurate cost than falling back to generic scalable codegen. // TODO: Each of these cases hints at a modeling gap around scalable vectors. - if (ST->hasVInstructions() && isa(Tp) && - LT.second.isFixedLengthVector()) { + if (auto *FVTp = dyn_cast(Tp); + FVTp && ST->hasVInstructions() && LT.second.isFixedLengthVector()) { InstructionCost VRegSplittingCost = costShuffleViaVRegSplitting( *this, LT.second, ST->getRealVLen(), Tp, Mask, CostKind); if (VRegSplittingCost.isValid()) @@ -544,6 +602,11 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, return Cost; } } + + if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind); + SlideCost.isValid()) + return SlideCost; + // vrgather + cost of generating the mask constant. // We model this for an unknown mask with a single vrgather. if (LT.first == 1 && (LT.second.getScalarSizeInBits() != 8 || @@ -558,6 +621,11 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, } case TTI::SK_Transpose: case TTI::SK_PermuteTwoSrc: { + + if (InstructionCost SlideCost = getSlideCost(FVTp, Mask, CostKind); + SlideCost.isValid()) + return SlideCost; + // 2 x (vrgather + cost of generating the mask constant) + cost of mask // register for the second vrgather. We model this for an unknown // (shuffle) mask. diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h index 134a7333b9b06..3f57560d3c127 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h @@ -63,6 +63,12 @@ class RISCVTTIImpl : public BasicTTIImplBase { /// type. InstructionCost getConstantPoolLoadCost(Type *Ty, TTI::TargetCostKind CostKind); + + /// If this shuffle can be lowered as a masked slide pair (at worst), + /// return a cost for it. + InstructionCost getSlideCost(FixedVectorType *Tp, ArrayRef Mask, + TTI::TargetCostKind CostKind); + public: explicit RISCVTTIImpl(const RISCVTargetMachine *TM, const Function &F) : BaseT(TM, F.getDataLayout()), ST(TM->getSubtargetImpl(F)), diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll index c951184a31731..06c709e4cc879 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-exact-vlen.ll @@ -186,7 +186,7 @@ define void @insert_subvec() vscale_range(2,2) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i32_4_1 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i32_4_2 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i32_4_3 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_4_05 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16i32_4_05 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; CHECK-SIZE-LABEL: 'insert_subvec' @@ -225,7 +225,7 @@ define void @insert_subvec() vscale_range(2,2) { ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i32_4_1 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i32_4_2 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v16i32_4_3 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16i32_4_05 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v16i32_4_05 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; %v4i8_2_0 = shufflevector <4 x i8> poison, <4 x i8> poison, <4 x i32> @@ -737,8 +737,8 @@ define void @multipart() vscale_range(2,2) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16c = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16d = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32a = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32a4 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32a = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v32a4 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32idrev = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v32many = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %v32many2 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> @@ -757,8 +757,8 @@ define void @multipart() vscale_range(2,2) { ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v16b = shufflevector <8 x i16> poison, <8 x i16> poison, <8 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16c = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v16d = shufflevector <16 x i16> poison, <16 x i16> poison, <16 x i32> -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v32a = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v32a4 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %v32a = shufflevector <4 x i32> poison, <4 x i32> poison, <4 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %v32a4 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v32idrev = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v32many = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %v32many2 = shufflevector <16 x i32> poison, <16 x i32> poison, <16 x i32> diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-extract_subvector.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-extract_subvector.ll index e8dd30345cc76..d2bfb61a11b00 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-extract_subvector.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-extract_subvector.ll @@ -19,7 +19,7 @@ define void @test_vXf64(<4 x double> %src256, <8 x double> %src512) { ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_2345 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_567u = shufflevector <8 x double> %src512, <8 x double> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; VLEN128-LABEL: 'test_vXf64' diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-transpose.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-transpose.ll index 8f784a07d3124..ef069fee8526e 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-transpose.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-transpose.ll @@ -10,11 +10,11 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define <8 x i8> @trn1.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: 'trn1.v8i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %tmp0 ; ; SIZE-LABEL: 'trn1.v8i8' -; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %tmp0 ; %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> @@ -23,11 +23,11 @@ define <8 x i8> @trn1.v8i8(<8 x i8> %v0, <8 x i8> %v1) { define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: 'trn2.v8i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i8> %tmp0 ; ; SIZE-LABEL: 'trn2.v8i8' -; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i8> %tmp0 ; %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> @@ -36,11 +36,11 @@ define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) { define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: 'trn1.v16i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %tmp0 ; ; SIZE-LABEL: 'trn1.v16i8' -; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %tmp0 ; %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> @@ -49,11 +49,11 @@ define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) { define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: 'trn2.v16i8' -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %tmp0 ; ; SIZE-LABEL: 'trn2.v16i8' -; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %tmp0 ; %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> @@ -62,11 +62,11 @@ define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) { define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ; CHECK-LABEL: 'trn1.v4i16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %tmp0 ; ; SIZE-LABEL: 'trn1.v4i16' -; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i16> %tmp0 ; %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> @@ -75,11 +75,11 @@ define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) { define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ; CHECK-LABEL: 'trn2.v4i16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i16> %tmp0 ; ; SIZE-LABEL: 'trn2.v4i16' -; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i16> %tmp0 ; %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> @@ -88,11 +88,11 @@ define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) { define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: 'trn1.v8i16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %tmp0 ; ; SIZE-LABEL: 'trn1.v8i16' -; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %tmp0 ; %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> @@ -101,11 +101,11 @@ define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) { define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: 'trn2.v8i16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %tmp0 ; ; SIZE-LABEL: 'trn2.v8i16' -; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %tmp0 ; %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> @@ -140,11 +140,11 @@ define <2 x i32> @trn2.v2i32(<2 x i32> %v0, <2 x i32> %v1) { define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: 'trn1.v4i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %tmp0 ; ; SIZE-LABEL: 'trn1.v4i32' -; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %tmp0 ; %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> @@ -153,11 +153,11 @@ define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) { define <4 x i32> @trn2.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: 'trn2.v4i32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %tmp0 ; ; SIZE-LABEL: 'trn2.v4i32' -; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %tmp0 ; %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> @@ -218,11 +218,11 @@ define <2 x float> @trn2.v2f32(<2 x float> %v0, <2 x float> %v1) { define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) { ; CHECK-LABEL: 'trn1.v4f32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %tmp0 ; ; SIZE-LABEL: 'trn1.v4f32' -; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %tmp0 ; %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> @@ -231,11 +231,11 @@ define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) { define <4 x float> @trn2.v4f32(<4 x float> %v0, <4 x float> %v1) { ; CHECK-LABEL: 'trn2.v4f32' -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x float> %tmp0 ; ; SIZE-LABEL: 'trn2.v4f32' -; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x float> %tmp0 ; %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> @@ -270,11 +270,11 @@ define <2 x double> @trn2.v2f64(<2 x double> %v0, <2 x double> %v1) { define <4 x half> @trn1.v4f16(<4 x half> %v0, <4 x half> %v1) { ; CHECK-LABEL: 'trn1.v4f16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x half> %tmp0 ; ; SIZE-LABEL: 'trn1.v4f16' -; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x half> %tmp0 ; %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> @@ -283,11 +283,11 @@ define <4 x half> @trn1.v4f16(<4 x half> %v0, <4 x half> %v1) { define <4 x half> @trn2.v4f16(<4 x half> %v0, <4 x half> %v1) { ; CHECK-LABEL: 'trn2.v4f16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x half> %tmp0 ; ; SIZE-LABEL: 'trn2.v4f16' -; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <4 x half> %tmp0 ; %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> @@ -296,11 +296,11 @@ define <4 x half> @trn2.v4f16(<4 x half> %v0, <4 x half> %v1) { define <8 x half> @trn1.v8f16(<8 x half> %v0, <8 x half> %v1) { ; CHECK-LABEL: 'trn1.v8f16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x half> %tmp0 ; ; SIZE-LABEL: 'trn1.v8f16' -; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x half> %tmp0 ; %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> @@ -309,11 +309,11 @@ define <8 x half> @trn1.v8f16(<8 x half> %v0, <8 x half> %v1) { define <8 x half> @trn2.v8f16(<8 x half> %v0, <8 x half> %v1) { ; CHECK-LABEL: 'trn2.v8f16' -; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x half> %tmp0 ; ; SIZE-LABEL: 'trn2.v8f16' -; SIZE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x half> %tmp0 ; %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll index 11fa3337544a1..18acae5835724 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/complex-loads.ll @@ -6,663 +6,175 @@ define i32 @test(ptr %pix1, ptr %pix2, i64 %idx.ext, i64 %idx.ext63, ptr %add.pt ; CHECK-LABEL: define i32 @test( ; CHECK-SAME: ptr [[PIX1:%.*]], ptr [[PIX2:%.*]], i64 [[IDX_EXT:%.*]], i64 [[IDX_EXT63:%.*]], ptr [[ADD_PTR:%.*]], ptr [[ADD_PTR64:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[PIX1]], align 1 -; CHECK-NEXT: [[CONV1:%.*]] = zext i8 [[TMP0]] to i32 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4 -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1 -; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3 -; CHECK-NEXT: [[TMP10:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1 -; CHECK-NEXT: [[CONV33:%.*]] = zext i8 [[TMP10]] to i32 ; CHECK-NEXT: [[ADD_PTR3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]] -; CHECK-NEXT: [[TMP11:%.*]] = load i8, ptr [[ADD_PTR3]], align 1 -; CHECK-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP11]] to i32 ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4 -; CHECK-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1 -; CHECK-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3 -; CHECK-NEXT: [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX27_1]], align 1 -; CHECK-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP5]] to i32 ; CHECK-NEXT: [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; CHECK-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] ; CHECK-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; CHECK-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 -; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP21:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP31:%.*]] = zext <2 x i8> [[TMP22]] to <2 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = sub <2 x i32> [[TMP21]], [[TMP31]] -; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; CHECK-NEXT: [[TMP49:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP26:%.*]] = zext <2 x i8> [[TMP49]] to <2 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP50:%.*]] = zext <2 x i8> [[TMP27]] to <2 x i32> -; CHECK-NEXT: [[TMP24:%.*]] = sub <2 x i32> [[TMP26]], [[TMP50]] -; CHECK-NEXT: [[TMP25:%.*]] = shl <2 x i32> [[TMP24]], splat (i32 16) -; CHECK-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP25]], [[TMP23]] -; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP32]] to <2 x i32> -; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP57:%.*]] = zext <2 x i8> [[TMP56]] to <2 x i32> -; CHECK-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP51]], [[TMP57]] -; CHECK-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32> -; CHECK-NEXT: [[TMP40:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP40]] to <2 x i32> -; CHECK-NEXT: [[TMP36:%.*]] = sub <2 x i32> [[TMP39]], [[TMP61]] -; CHECK-NEXT: [[TMP37:%.*]] = shl <2 x i32> [[TMP36]], splat (i32 16) -; CHECK-NEXT: [[TMP42:%.*]] = add <2 x i32> [[TMP37]], [[TMP35]] -; CHECK-NEXT: [[TMP34:%.*]] = add <2 x i32> [[TMP42]], [[TMP30]] -; CHECK-NEXT: [[TMP44:%.*]] = sub <2 x i32> [[TMP30]], [[TMP42]] -; CHECK-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[TMP34]], i32 0 -; CHECK-NEXT: [[CONV_2:%.*]] = extractelement <2 x i32> [[TMP34]], i32 1 -; CHECK-NEXT: [[ADD48_2:%.*]] = add i32 [[CONV_2]], [[TMP43]] -; CHECK-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP44]], i32 0 -; CHECK-NEXT: [[TMP47:%.*]] = extractelement <2 x i32> [[TMP44]], i32 1 -; CHECK-NEXT: [[ADD55_2:%.*]] = add i32 [[TMP47]], [[TMP46]] ; CHECK-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 -; CHECK-NEXT: [[TMP53:%.*]] = load <2 x i8>, ptr null, align 1 ; CHECK-NEXT: [[TMP52:%.*]] = load i8, ptr null, align 1 -; CHECK-NEXT: [[TMP62:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32> -; CHECK-NEXT: [[TMP77:%.*]] = zext i8 [[TMP52]] to i32 -; CHECK-NEXT: [[TMP54:%.*]] = load <2 x i8>, ptr null, align 1 -; CHECK-NEXT: [[TMP55:%.*]] = zext <2 x i8> [[TMP54]] to <2 x i32> -; CHECK-NEXT: [[TMP59:%.*]] = sub <2 x i32> [[TMP62]], [[TMP55]] -; CHECK-NEXT: [[TMP41:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2) -; CHECK-NEXT: [[TMP58:%.*]] = zext <2 x i8> [[TMP41]] to <2 x i32> -; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <2 x i32> [[TMP58]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP63:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; CHECK-NEXT: [[TMP76:%.*]] = zext <2 x i8> [[TMP63]] to <2 x i32> -; CHECK-NEXT: [[TMP81:%.*]] = sub <2 x i32> [[TMP48]], [[TMP76]] -; CHECK-NEXT: [[TMP167:%.*]] = shl <2 x i32> [[TMP81]], splat (i32 16) -; CHECK-NEXT: [[TMP75:%.*]] = add <2 x i32> [[TMP167]], [[TMP59]] -; CHECK-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2 -; CHECK-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2 -; CHECK-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6 -; CHECK-NEXT: [[TMP64:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1 -; CHECK-NEXT: [[TMP79:%.*]] = zext <2 x i8> [[TMP64]] to <2 x i32> -; CHECK-NEXT: [[TMP82:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1 -; CHECK-NEXT: [[TMP91:%.*]] = zext <2 x i8> [[TMP82]] to <2 x i32> -; CHECK-NEXT: [[TMP65:%.*]] = sub <2 x i32> [[TMP79]], [[TMP91]] -; CHECK-NEXT: [[TMP170:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> splat (i1 true), <2 x i8> poison) -; CHECK-NEXT: [[TMP171:%.*]] = zext <2 x i8> [[TMP170]] to <2 x i32> -; CHECK-NEXT: [[TMP172:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1 -; CHECK-NEXT: [[TMP173:%.*]] = zext <2 x i8> [[TMP172]] to <2 x i32> -; CHECK-NEXT: [[TMP66:%.*]] = sub <2 x i32> [[TMP171]], [[TMP173]] -; CHECK-NEXT: [[TMP67:%.*]] = shl <2 x i32> [[TMP66]], splat (i32 16) -; CHECK-NEXT: [[TMP69:%.*]] = add <2 x i32> [[TMP67]], [[TMP65]] -; CHECK-NEXT: [[TMP176:%.*]] = extractelement <2 x i32> [[TMP75]], i32 0 -; CHECK-NEXT: [[TMP197:%.*]] = extractelement <2 x i32> [[TMP75]], i32 1 -; CHECK-NEXT: [[SUB59:%.*]] = add i32 [[TMP197]], [[TMP176]] -; CHECK-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP176]], [[TMP197]] -; CHECK-NEXT: [[ADD112_2:%.*]] = extractelement <2 x i32> [[TMP69]], i32 0 -; CHECK-NEXT: [[XOR_I63_2:%.*]] = extractelement <2 x i32> [[TMP69]], i32 1 -; CHECK-NEXT: [[SUB59_1:%.*]] = add i32 [[XOR_I63_2]], [[ADD112_2]] -; CHECK-NEXT: [[SUB47_3:%.*]] = sub i32 [[ADD112_2]], [[XOR_I63_2]] -; CHECK-NEXT: [[ADD94:%.*]] = add i32 [[SUB59_1]], [[SUB59]] -; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <2 x i32> [[TMP34]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP71:%.*]] = insertelement <2 x i32> [[TMP70]], i32 [[SUB59]], i32 0 -; CHECK-NEXT: [[TMP72:%.*]] = insertelement <2 x i32> [[TMP34]], i32 [[SUB59_1]], i32 0 -; CHECK-NEXT: [[TMP222:%.*]] = sub <2 x i32> [[TMP71]], [[TMP72]] -; CHECK-NEXT: [[ADD55_3:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]] -; CHECK-NEXT: [[TMP74:%.*]] = shufflevector <2 x i32> [[TMP44]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP78:%.*]] = insertelement <2 x i32> [[TMP74]], i32 [[SUB45_3]], i32 0 -; CHECK-NEXT: [[TMP80:%.*]] = insertelement <2 x i32> [[TMP44]], i32 [[SUB47_3]], i32 0 -; CHECK-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP78]], [[TMP80]] -; CHECK-NEXT: [[ADD95:%.*]] = add i32 [[ADD94]], [[ADD48_2]] -; CHECK-NEXT: [[SUB86_3:%.*]] = sub i32 [[ADD48_2]], [[ADD94]] -; CHECK-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP77]], 15 -; CHECK-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 -; CHECK-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535 -; CHECK-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[CONV_2]], 15 -; CHECK-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537 -; CHECK-NEXT: [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535 -; CHECK-NEXT: [[TMP86:%.*]] = extractelement <2 x i32> [[TMP222]], i32 0 -; CHECK-NEXT: [[TMP87:%.*]] = extractelement <2 x i32> [[TMP222]], i32 1 -; CHECK-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP86]], [[TMP87]] -; CHECK-NEXT: [[ADD112_1:%.*]] = sub i32 [[TMP87]], [[TMP86]] -; CHECK-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15 -; CHECK-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537 -; CHECK-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535 -; CHECK-NEXT: [[TMP88:%.*]] = extractelement <2 x i32> [[TMP85]], i32 0 -; CHECK-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1 -; CHECK-NEXT: [[ADD94_4:%.*]] = add i32 [[TMP88]], [[TMP89]] -; CHECK-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP89]], [[TMP88]] -; CHECK-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV1]], 15 -; CHECK-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537 -; CHECK-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535 -; CHECK-NEXT: [[TMP90:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 -; CHECK-NEXT: [[TMP102:%.*]] = zext <2 x i8> [[TMP90]] to <2 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = load i8, ptr null, align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1 ; CHECK-NEXT: [[TMP92:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 -; CHECK-NEXT: [[TMP93:%.*]] = shufflevector <4 x i8> [[TMP92]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP94:%.*]] = zext <2 x i8> [[TMP93]] to <2 x i32> ; CHECK-NEXT: [[TMP95:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 -; CHECK-NEXT: [[TMP96:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32> ; CHECK-NEXT: [[TMP98:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; CHECK-NEXT: [[TMP99:%.*]] = shufflevector <4 x i8> [[TMP98]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP100:%.*]] = zext <2 x i8> [[TMP99]] to <2 x i32> -; CHECK-NEXT: [[TMP101:%.*]] = sub <2 x i32> [[TMP97]], [[TMP100]] -; CHECK-NEXT: [[TMP224:%.*]] = shl <2 x i32> [[TMP101]], splat (i32 16) -; CHECK-NEXT: [[TMP103:%.*]] = shufflevector <4 x i8> [[TMP92]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP104:%.*]] = zext <2 x i8> [[TMP103]] to <2 x i32> -; CHECK-NEXT: [[TMP105:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP106:%.*]] = zext <2 x i8> [[TMP105]] to <2 x i32> -; CHECK-NEXT: [[TMP107:%.*]] = shufflevector <4 x i8> [[TMP98]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP108:%.*]] = zext <2 x i8> [[TMP107]] to <2 x i32> -; CHECK-NEXT: [[TMP109:%.*]] = sub <2 x i32> [[TMP106]], [[TMP108]] -; CHECK-NEXT: [[TMP110:%.*]] = shl <2 x i32> [[TMP109]], splat (i32 16) -; CHECK-NEXT: [[TMP111:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV33]], i32 1 -; CHECK-NEXT: [[TMP112:%.*]] = sub <2 x i32> [[TMP111]], [[TMP104]] -; CHECK-NEXT: [[TMP113:%.*]] = add <2 x i32> [[TMP110]], [[TMP112]] -; CHECK-NEXT: [[TMP114:%.*]] = insertelement <2 x i32> [[TMP102]], i32 [[CONV1]], i32 0 -; CHECK-NEXT: [[TMP115:%.*]] = sub <2 x i32> [[TMP114]], [[TMP94]] -; CHECK-NEXT: [[TMP116:%.*]] = add <2 x i32> [[TMP224]], [[TMP115]] -; CHECK-NEXT: [[TMP117:%.*]] = shufflevector <2 x i32> [[TMP113]], <2 x i32> [[TMP116]], <2 x i32> -; CHECK-NEXT: [[TMP126:%.*]] = add <2 x i32> [[TMP113]], [[TMP116]] -; CHECK-NEXT: [[TMP119:%.*]] = sub <2 x i32> [[TMP116]], [[TMP113]] -; CHECK-NEXT: [[TMP120:%.*]] = extractelement <2 x i32> [[TMP126]], i32 0 -; CHECK-NEXT: [[TMP127:%.*]] = extractelement <2 x i32> [[TMP126]], i32 1 -; CHECK-NEXT: [[ADD48:%.*]] = add i32 [[TMP127]], [[TMP120]] -; CHECK-NEXT: [[TMP166:%.*]] = sub i32 [[TMP120]], [[TMP127]] -; CHECK-NEXT: [[TMP128:%.*]] = extractelement <2 x i32> [[TMP119]], i32 0 -; CHECK-NEXT: [[TMP129:%.*]] = extractelement <2 x i32> [[TMP119]], i32 1 -; CHECK-NEXT: [[ADD55:%.*]] = add i32 [[TMP129]], [[TMP128]] -; CHECK-NEXT: [[SUB60:%.*]] = sub i32 [[TMP128]], [[TMP129]] -; CHECK-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP127]], 15 -; CHECK-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537 -; CHECK-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535 -; CHECK-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP129]], 15 -; CHECK-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537 -; CHECK-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535 -; CHECK-NEXT: [[TMP130:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 -; CHECK-NEXT: [[TMP131:%.*]] = zext <2 x i8> [[TMP130]] to <2 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1 ; CHECK-NEXT: [[TMP132:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 -; CHECK-NEXT: [[TMP133:%.*]] = shufflevector <4 x i8> [[TMP132]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP134:%.*]] = zext <2 x i8> [[TMP133]] to <2 x i32> ; CHECK-NEXT: [[TMP135:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; CHECK-NEXT: [[TMP136:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP137:%.*]] = zext <2 x i8> [[TMP136]] to <2 x i32> ; CHECK-NEXT: [[TMP138:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; CHECK-NEXT: [[TMP139:%.*]] = shufflevector <4 x i8> [[TMP138]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP140:%.*]] = zext <2 x i8> [[TMP139]] to <2 x i32> -; CHECK-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP137]], [[TMP140]] -; CHECK-NEXT: [[TMP142:%.*]] = shl <2 x i32> [[TMP141]], splat (i32 16) -; CHECK-NEXT: [[TMP143:%.*]] = shufflevector <4 x i8> [[TMP132]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP144:%.*]] = zext <2 x i8> [[TMP143]] to <2 x i32> -; CHECK-NEXT: [[TMP145:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP146:%.*]] = zext <2 x i8> [[TMP145]] to <2 x i32> -; CHECK-NEXT: [[TMP147:%.*]] = shufflevector <4 x i8> [[TMP138]], <4 x i8> poison, <2 x i32> -; CHECK-NEXT: [[TMP148:%.*]] = zext <2 x i8> [[TMP147]] to <2 x i32> -; CHECK-NEXT: [[TMP149:%.*]] = sub <2 x i32> [[TMP146]], [[TMP148]] -; CHECK-NEXT: [[TMP150:%.*]] = shl <2 x i32> [[TMP149]], splat (i32 16) -; CHECK-NEXT: [[TMP151:%.*]] = insertelement <2 x i32> [[TMP131]], i32 [[CONV33_1]], i32 1 -; CHECK-NEXT: [[TMP225:%.*]] = sub <2 x i32> [[TMP151]], [[TMP144]] -; CHECK-NEXT: [[TMP153:%.*]] = add <2 x i32> [[TMP150]], [[TMP225]] -; CHECK-NEXT: [[TMP154:%.*]] = insertelement <2 x i32> [[TMP131]], i32 [[CONV_1]], i32 0 -; CHECK-NEXT: [[TMP155:%.*]] = sub <2 x i32> [[TMP154]], [[TMP134]] -; CHECK-NEXT: [[TMP156:%.*]] = add <2 x i32> [[TMP142]], [[TMP155]] -; CHECK-NEXT: [[TMP157:%.*]] = add <2 x i32> [[TMP153]], [[TMP156]] -; CHECK-NEXT: [[TMP158:%.*]] = sub <2 x i32> [[TMP156]], [[TMP153]] -; CHECK-NEXT: [[TMP159:%.*]] = extractelement <2 x i32> [[TMP157]], i32 0 -; CHECK-NEXT: [[TMP160:%.*]] = extractelement <2 x i32> [[TMP157]], i32 1 -; CHECK-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP160]], [[TMP159]] -; CHECK-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP159]], [[TMP160]] -; CHECK-NEXT: [[TMP161:%.*]] = extractelement <2 x i32> [[TMP158]], i32 0 -; CHECK-NEXT: [[TMP162:%.*]] = extractelement <2 x i32> [[TMP158]], i32 1 -; CHECK-NEXT: [[ADD55_1:%.*]] = add i32 [[TMP162]], [[TMP161]] -; CHECK-NEXT: [[SUB59_2:%.*]] = sub i32 [[TMP161]], [[TMP162]] -; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP160]], 15 -; CHECK-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537 -; CHECK-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535 -; CHECK-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[TMP162]], 15 -; CHECK-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537 -; CHECK-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535 -; CHECK-NEXT: [[TMP163:%.*]] = lshr <2 x i32> [[TMP131]], splat (i32 15) -; CHECK-NEXT: [[TMP164:%.*]] = and <2 x i32> [[TMP163]], splat (i32 65537) -; CHECK-NEXT: [[TMP165:%.*]] = mul <2 x i32> [[TMP164]], splat (i32 65535) -; CHECK-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_1]], [[ADD48]] -; CHECK-NEXT: [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_1]] -; CHECK-NEXT: [[ADD103:%.*]] = add i32 [[ADD95]], [[ADD78]] -; CHECK-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD95]] -; CHECK-NEXT: [[ADD105:%.*]] = add i32 [[SUB86_3]], [[SUB86]] -; CHECK-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB86_3]] -; CHECK-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I_1]], [[ADD103]] -; CHECK-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[TMP77]] -; CHECK-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51_1]], [[ADD105]] -; CHECK-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[CONV_2]] -; CHECK-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]] -; CHECK-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP160]] -; CHECK-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]] -; CHECK-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP127]] -; CHECK-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]] -; CHECK-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]] -; CHECK-NEXT: [[ADD105_3:%.*]] = add i32 [[ADD112]], [[XOR_I63]] -; CHECK-NEXT: [[TMP169:%.*]] = load <2 x i8>, ptr [[ADD_PTR_1]], align 1 -; CHECK-NEXT: [[TMP181:%.*]] = zext <2 x i8> [[TMP169]] to <2 x i32> -; CHECK-NEXT: [[TMP152:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_2]], i32 0 -; CHECK-NEXT: [[TMP182:%.*]] = shufflevector <2 x i32> [[TMP152]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP183:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_3]], i32 0 -; CHECK-NEXT: [[TMP184:%.*]] = shufflevector <2 x i32> [[TMP183]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP191:%.*]] = sub <2 x i32> [[TMP182]], [[TMP184]] -; CHECK-NEXT: [[TMP192:%.*]] = add <2 x i32> [[TMP182]], [[TMP184]] -; CHECK-NEXT: [[TMP194:%.*]] = shufflevector <2 x i32> [[TMP191]], <2 x i32> [[TMP192]], <2 x i32> -; CHECK-NEXT: [[TMP195:%.*]] = lshr <2 x i32> [[TMP181]], splat (i32 15) -; CHECK-NEXT: [[TMP196:%.*]] = and <2 x i32> [[TMP195]], splat (i32 65537) -; CHECK-NEXT: [[TMP198:%.*]] = mul <2 x i32> [[TMP196]], splat (i32 65535) -; CHECK-NEXT: [[TMP202:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55]], i32 0 -; CHECK-NEXT: [[TMP203:%.*]] = shufflevector <2 x i32> [[TMP202]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP205:%.*]] = insertelement <2 x i32> poison, i32 [[ADD55_1]], i32 0 -; CHECK-NEXT: [[TMP206:%.*]] = shufflevector <2 x i32> [[TMP205]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP207:%.*]] = sub <2 x i32> [[TMP203]], [[TMP206]] -; CHECK-NEXT: [[TMP210:%.*]] = add <2 x i32> [[TMP203]], [[TMP206]] -; CHECK-NEXT: [[TMP168:%.*]] = shufflevector <2 x i32> [[TMP207]], <2 x i32> [[TMP210]], <2 x i32> -; CHECK-NEXT: [[ADD94_1:%.*]] = extractelement <2 x i32> [[TMP194]], i32 1 -; CHECK-NEXT: [[ADD78_1:%.*]] = extractelement <2 x i32> [[TMP168]], i32 1 -; CHECK-NEXT: [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]] -; CHECK-NEXT: [[TMP220:%.*]] = add <2 x i32> [[TMP194]], [[TMP168]] -; CHECK-NEXT: [[SUB102_1:%.*]] = extractelement <2 x i32> [[TMP194]], i32 0 -; CHECK-NEXT: [[SUB86_1:%.*]] = extractelement <2 x i32> [[TMP168]], i32 0 -; CHECK-NEXT: [[TMP174:%.*]] = shufflevector <2 x i32> [[TMP168]], <2 x i32> [[TMP194]], <2 x i32> -; CHECK-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]] -; CHECK-NEXT: [[TMP175:%.*]] = add <2 x i32> [[TMP198]], [[TMP220]] -; CHECK-NEXT: [[TMP221:%.*]] = xor <2 x i32> [[TMP175]], [[TMP181]] -; CHECK-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]] -; CHECK-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP162]] -; CHECK-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]] -; CHECK-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP129]] -; CHECK-NEXT: [[XOR_I53_1:%.*]] = extractelement <2 x i32> [[TMP221]], i32 0 -; CHECK-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD105_3]] -; CHECK-NEXT: [[XOR_I_1:%.*]] = extractelement <2 x i32> [[TMP221]], i32 1 -; CHECK-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]] -; CHECK-NEXT: [[ADD112_5:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]] -; CHECK-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_5]], [[XOR_I63_1]] -; CHECK-NEXT: [[ADD78_3:%.*]] = add i32 [[SUB51_1]], [[TMP166]] -; CHECK-NEXT: [[TMP204:%.*]] = sub i32 [[TMP166]], [[SUB51_1]] -; CHECK-NEXT: [[TMP177:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0 -; CHECK-NEXT: [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP177]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP179:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0 -; CHECK-NEXT: [[TMP180:%.*]] = shufflevector <2 x i32> [[TMP179]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP199:%.*]] = add <2 x i32> [[TMP178]], [[TMP180]] -; CHECK-NEXT: [[TMP200:%.*]] = sub <2 x i32> [[TMP178]], [[TMP180]] -; CHECK-NEXT: [[TMP201:%.*]] = shufflevector <2 x i32> [[TMP199]], <2 x i32> [[TMP200]], <2 x i32> -; CHECK-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[TMP204]] -; CHECK-NEXT: [[SUB106_2:%.*]] = sub i32 [[TMP204]], [[ADD112_1]] -; CHECK-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD113_1]] -; CHECK-NEXT: [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]] -; CHECK-NEXT: [[TMP208:%.*]] = add <2 x i32> [[TMP165]], [[TMP201]] -; CHECK-NEXT: [[TMP209:%.*]] = xor <2 x i32> [[TMP208]], [[TMP131]] -; CHECK-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP120]], 15 -; CHECK-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537 -; CHECK-NEXT: [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535 -; CHECK-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]] -; CHECK-NEXT: [[XOR_I63_4:%.*]] = xor i32 [[ADD_I62_2]], [[TMP120]] -; CHECK-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_2]] -; CHECK-NEXT: [[TMP211:%.*]] = extractelement <2 x i32> [[TMP209]], i32 0 -; CHECK-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP211]] -; CHECK-NEXT: [[TMP212:%.*]] = extractelement <2 x i32> [[TMP209]], i32 1 -; CHECK-NEXT: [[ADD112_4:%.*]] = add i32 [[ADD110_2]], [[TMP212]] -; CHECK-NEXT: [[ADD113_4:%.*]] = add i32 [[ADD112_4]], [[XOR_I63_4]] -; CHECK-NEXT: [[ADD78_4:%.*]] = add i32 [[SUB59_2]], [[SUB60]] -; CHECK-NEXT: [[SUB86_4:%.*]] = sub i32 [[SUB60]], [[SUB59_2]] -; CHECK-NEXT: [[TMP213:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_4]], i32 0 -; CHECK-NEXT: [[TMP214:%.*]] = shufflevector <2 x i32> [[TMP213]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP215:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_4]], i32 0 -; CHECK-NEXT: [[TMP216:%.*]] = shufflevector <2 x i32> [[TMP215]], <2 x i32> poison, <2 x i32> zeroinitializer -; CHECK-NEXT: [[TMP217:%.*]] = add <2 x i32> [[TMP214]], [[TMP216]] -; CHECK-NEXT: [[TMP218:%.*]] = sub <2 x i32> [[TMP214]], [[TMP216]] -; CHECK-NEXT: [[TMP219:%.*]] = shufflevector <2 x i32> [[TMP217]], <2 x i32> [[TMP218]], <2 x i32> -; CHECK-NEXT: [[ADD105_4:%.*]] = add i32 [[SUB102_3]], [[SUB86_4]] -; CHECK-NEXT: [[SUB106_3:%.*]] = sub i32 [[SUB86_4]], [[SUB102_3]] -; CHECK-NEXT: [[ADD_I52_4:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_4]] -; CHECK-NEXT: [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_4]], [[CONV1]] -; CHECK-NEXT: [[TMP185:%.*]] = lshr <2 x i32> [[TMP102]], splat (i32 15) -; CHECK-NEXT: [[TMP193:%.*]] = and <2 x i32> [[TMP185]], splat (i32 65537) -; CHECK-NEXT: [[TMP186:%.*]] = mul <2 x i32> [[TMP193]], splat (i32 65535) -; CHECK-NEXT: [[TMP187:%.*]] = add <2 x i32> [[TMP186]], [[TMP219]] -; CHECK-NEXT: [[TMP188:%.*]] = xor <2 x i32> [[TMP187]], [[TMP102]] -; CHECK-NEXT: [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15 -; CHECK-NEXT: [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537 -; CHECK-NEXT: [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535 -; CHECK-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]] -; CHECK-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]] -; CHECK-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_4]] -; CHECK-NEXT: [[TMP189:%.*]] = extractelement <2 x i32> [[TMP188]], i32 0 -; CHECK-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP189]] -; CHECK-NEXT: [[TMP190:%.*]] = extractelement <2 x i32> [[TMP188]], i32 1 -; CHECK-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP190]] -; CHECK-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]] +; CHECK-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; CHECK-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; CHECK-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; CHECK-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; CHECK-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr null, align 1 +; CHECK-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP10]], i64 0) +; CHECK-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP15]], <4 x i8> [[TMP14]], i64 4) +; CHECK-NEXT: [[TMP17:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP16]], <4 x i8> [[TMP2]], i64 8) +; CHECK-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP17]], <4 x i8> [[TMP6]], i64 12) +; CHECK-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32> +; CHECK-NEXT: [[TMP20:%.*]] = load <4 x i8>, ptr null, align 1 +; CHECK-NEXT: [[TMP21:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP11]], i64 0) +; CHECK-NEXT: [[TMP22:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP21]], <4 x i8> [[TMP20]], i64 4) +; CHECK-NEXT: [[TMP23:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP22]], <4 x i8> [[TMP92]], i64 8) +; CHECK-NEXT: [[TMP24:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP23]], <4 x i8> [[TMP132]], i64 12) +; CHECK-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> +; CHECK-NEXT: [[TMP26:%.*]] = sub <16 x i32> [[TMP19]], [[TMP25]] +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <16 x i32> [[TMP26]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2) +; CHECK-NEXT: [[TMP29:%.*]] = shufflevector <2 x i8> [[TMP28]], <2 x i8> poison, <4 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP29]], <16 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP135]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP95]], <4 x i8> poison, <16 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = shufflevector <16 x i8> [[TMP32]], <16 x i8> [[TMP33]], <16 x i32> +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP34]], i8 [[TMP3]], i32 5 +; CHECK-NEXT: [[TMP36:%.*]] = insertelement <16 x i8> [[TMP35]], i8 [[TMP52]], i32 9 +; CHECK-NEXT: [[TMP37:%.*]] = zext <16 x i8> [[TMP36]] to <16 x i32> +; CHECK-NEXT: [[TMP38:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; CHECK-NEXT: [[TMP39:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP13]], i64 0) +; CHECK-NEXT: [[TMP40:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP39]], <4 x i8> [[TMP38]], i64 4) +; CHECK-NEXT: [[TMP41:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP40]], <4 x i8> [[TMP98]], i64 8) +; CHECK-NEXT: [[TMP42:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP41]], <4 x i8> [[TMP138]], i64 12) +; CHECK-NEXT: [[TMP43:%.*]] = zext <16 x i8> [[TMP42]] to <16 x i32> +; CHECK-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = sub <16 x i32> [[TMP37]], [[TMP44]] +; CHECK-NEXT: [[TMP46:%.*]] = shl <16 x i32> [[TMP45]], splat (i32 16) +; CHECK-NEXT: [[TMP47:%.*]] = add <16 x i32> [[TMP46]], [[TMP27]] +; CHECK-NEXT: [[TMP48:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP49:%.*]] = add <16 x i32> [[TMP47]], [[TMP48]] +; CHECK-NEXT: [[TMP50:%.*]] = sub <16 x i32> [[TMP47]], [[TMP48]] +; CHECK-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP49]], <16 x i32> [[TMP50]], <16 x i32> +; CHECK-NEXT: [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP53:%.*]] = add <16 x i32> [[TMP51]], [[TMP70]] +; CHECK-NEXT: [[TMP54:%.*]] = sub <16 x i32> [[TMP51]], [[TMP70]] +; CHECK-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP54]], <16 x i32> +; CHECK-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP57:%.*]] = sub <16 x i32> [[TMP55]], [[TMP56]] +; CHECK-NEXT: [[TMP58:%.*]] = add <16 x i32> [[TMP55]], [[TMP56]] +; CHECK-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> +; CHECK-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TMP61:%.*]] = add <16 x i32> [[TMP59]], [[TMP60]] +; CHECK-NEXT: [[TMP62:%.*]] = sub <16 x i32> [[TMP59]], [[TMP60]] +; CHECK-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> +; CHECK-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP19]], <16 x i32> +; CHECK-NEXT: [[TMP65:%.*]] = lshr <16 x i32> [[TMP64]], splat (i32 15) +; CHECK-NEXT: [[TMP66:%.*]] = and <16 x i32> [[TMP65]], splat (i32 65537) +; CHECK-NEXT: [[TMP67:%.*]] = mul <16 x i32> [[TMP66]], splat (i32 65535) +; CHECK-NEXT: [[TMP68:%.*]] = add <16 x i32> [[TMP67]], [[TMP63]] +; CHECK-NEXT: [[TMP69:%.*]] = xor <16 x i32> [[TMP68]], [[TMP64]] +; CHECK-NEXT: [[ADD113_3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) ; CHECK-NEXT: ret i32 [[ADD113_3]] ; ; THR15-LABEL: define i32 @test( ; THR15-SAME: ptr [[PIX1:%.*]], ptr [[PIX2:%.*]], i64 [[IDX_EXT:%.*]], i64 [[IDX_EXT63:%.*]], ptr [[ADD_PTR:%.*]], ptr [[ADD_PTR64:%.*]]) #[[ATTR0:[0-9]+]] { ; THR15-NEXT: entry: -; THR15-NEXT: [[TMP0:%.*]] = load i8, ptr [[PIX1]], align 1 -; THR15-NEXT: [[CONV:%.*]] = zext i8 [[TMP0]] to i32 ; THR15-NEXT: [[ARRAYIDX3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 4 ; THR15-NEXT: [[ARRAYIDX5:%.*]] = getelementptr i8, ptr [[PIX2]], i64 4 -; THR15-NEXT: [[ARRAYIDX8:%.*]] = getelementptr i8, ptr [[PIX1]], i64 1 -; THR15-NEXT: [[ARRAYIDX32:%.*]] = getelementptr i8, ptr [[PIX1]], i64 3 -; THR15-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX32]], align 1 -; THR15-NEXT: [[CONV33:%.*]] = zext i8 [[TMP1]] to i32 ; THR15-NEXT: [[ADD_PTR3:%.*]] = getelementptr i8, ptr [[PIX1]], i64 [[IDX_EXT]] ; THR15-NEXT: [[ADD_PTR644:%.*]] = getelementptr i8, ptr [[PIX2]], i64 [[IDX_EXT63]] -; THR15-NEXT: [[TMP2:%.*]] = load i8, ptr [[ADD_PTR3]], align 1 -; THR15-NEXT: [[CONV_1:%.*]] = zext i8 [[TMP2]] to i32 ; THR15-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 4 ; THR15-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr i8, ptr [[ADD_PTR644]], i64 4 -; THR15-NEXT: [[ARRAYIDX8_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 1 -; THR15-NEXT: [[ARRAYIDX27_1:%.*]] = getelementptr i8, ptr [[ADD_PTR3]], i64 3 -; THR15-NEXT: [[TMP5:%.*]] = load i8, ptr [[ARRAYIDX27_1]], align 1 -; THR15-NEXT: [[CONV33_1:%.*]] = zext i8 [[TMP5]] to i32 ; THR15-NEXT: [[ADD_PTR_1:%.*]] = getelementptr i8, ptr [[ADD_PTR]], i64 [[IDX_EXT]] ; THR15-NEXT: [[ADD_PTR64_1:%.*]] = getelementptr i8, ptr [[ADD_PTR64]], i64 [[IDX_EXT63]] ; THR15-NEXT: [[ARRAYIDX3_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 4 ; THR15-NEXT: [[ARRAYIDX5_2:%.*]] = getelementptr i8, ptr [[ADD_PTR64_1]], i64 4 -; THR15-NEXT: [[ARRAYIDX8_2:%.*]] = getelementptr i8, ptr [[ADD_PTR_1]], i64 1 -; THR15-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 -; THR15-NEXT: [[TMP7:%.*]] = load i8, ptr [[ARRAYIDX8_2]], align 1 -; THR15-NEXT: [[TMP6:%.*]] = load i8, ptr [[ADD_PTR_1]], align 1 -; THR15-NEXT: [[TMP19:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP20:%.*]] = zext <2 x i8> [[TMP19]] to <2 x i32> -; THR15-NEXT: [[TMP87:%.*]] = zext i8 [[TMP6]] to i32 -; THR15-NEXT: [[TMP9:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 -; THR15-NEXT: [[TMP21:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP22:%.*]] = zext <2 x i8> [[TMP21]] to <2 x i32> -; THR15-NEXT: [[TMP23:%.*]] = sub <2 x i32> [[TMP20]], [[TMP22]] -; THR15-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 -; THR15-NEXT: [[TMP24:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP25:%.*]] = zext <2 x i8> [[TMP24]] to <2 x i32> -; THR15-NEXT: [[TMP16:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 -; THR15-NEXT: [[TMP26:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP27:%.*]] = zext <2 x i8> [[TMP26]] to <2 x i32> -; THR15-NEXT: [[TMP28:%.*]] = sub <2 x i32> [[TMP25]], [[TMP27]] -; THR15-NEXT: [[TMP29:%.*]] = shl <2 x i32> [[TMP28]], splat (i32 16) -; THR15-NEXT: [[TMP59:%.*]] = add <2 x i32> [[TMP29]], [[TMP23]] -; THR15-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP4]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP32:%.*]] = zext <2 x i8> [[TMP31]] to <2 x i32> -; THR15-NEXT: [[TMP86:%.*]] = zext i8 [[TMP7]] to i32 -; THR15-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP9]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP34:%.*]] = zext <2 x i8> [[TMP33]] to <2 x i32> -; THR15-NEXT: [[TMP35:%.*]] = sub <2 x i32> [[TMP32]], [[TMP34]] -; THR15-NEXT: [[TMP36:%.*]] = shufflevector <4 x i8> [[TMP13]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP37:%.*]] = zext <2 x i8> [[TMP36]] to <2 x i32> -; THR15-NEXT: [[TMP38:%.*]] = shufflevector <4 x i8> [[TMP16]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP39:%.*]] = zext <2 x i8> [[TMP38]] to <2 x i32> -; THR15-NEXT: [[TMP40:%.*]] = sub <2 x i32> [[TMP37]], [[TMP39]] -; THR15-NEXT: [[TMP41:%.*]] = shl <2 x i32> [[TMP40]], splat (i32 16) -; THR15-NEXT: [[TMP76:%.*]] = add <2 x i32> [[TMP41]], [[TMP35]] -; THR15-NEXT: [[TMP30:%.*]] = add <2 x i32> [[TMP76]], [[TMP59]] -; THR15-NEXT: [[TMP42:%.*]] = sub <2 x i32> [[TMP59]], [[TMP76]] -; THR15-NEXT: [[TMP43:%.*]] = extractelement <2 x i32> [[TMP30]], i32 0 -; THR15-NEXT: [[TMP44:%.*]] = extractelement <2 x i32> [[TMP30]], i32 1 -; THR15-NEXT: [[ADD44_2:%.*]] = add i32 [[TMP44]], [[TMP43]] -; THR15-NEXT: [[TMP45:%.*]] = extractelement <2 x i32> [[TMP42]], i32 0 -; THR15-NEXT: [[TMP46:%.*]] = extractelement <2 x i32> [[TMP42]], i32 1 -; THR15-NEXT: [[ADD46_2:%.*]] = add i32 [[TMP46]], [[TMP45]] ; THR15-NEXT: [[ARRAYIDX5_3:%.*]] = getelementptr i8, ptr null, i64 4 -; THR15-NEXT: [[TMP47:%.*]] = load <2 x i8>, ptr null, align 1 ; THR15-NEXT: [[TMP48:%.*]] = load i8, ptr null, align 1 -; THR15-NEXT: [[TMP49:%.*]] = zext <2 x i8> [[TMP47]] to <2 x i32> -; THR15-NEXT: [[CONV_3:%.*]] = zext i8 [[TMP48]] to i32 -; THR15-NEXT: [[TMP50:%.*]] = load <2 x i8>, ptr null, align 1 -; THR15-NEXT: [[TMP51:%.*]] = zext <2 x i8> [[TMP50]] to <2 x i32> -; THR15-NEXT: [[TMP52:%.*]] = sub <2 x i32> [[TMP49]], [[TMP51]] -; THR15-NEXT: [[TMP53:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2) -; THR15-NEXT: [[TMP54:%.*]] = zext <2 x i8> [[TMP53]] to <2 x i32> -; THR15-NEXT: [[TMP77:%.*]] = shufflevector <2 x i32> [[TMP54]], <2 x i32> poison, <2 x i32> -; THR15-NEXT: [[TMP55:%.*]] = load <2 x i8>, ptr [[ARRAYIDX5_3]], align 1 -; THR15-NEXT: [[TMP56:%.*]] = zext <2 x i8> [[TMP55]] to <2 x i32> -; THR15-NEXT: [[TMP57:%.*]] = sub <2 x i32> [[TMP77]], [[TMP56]] -; THR15-NEXT: [[TMP58:%.*]] = shl <2 x i32> [[TMP57]], splat (i32 16) -; THR15-NEXT: [[TMP72:%.*]] = add <2 x i32> [[TMP58]], [[TMP52]] -; THR15-NEXT: [[ARRAYIDX20_3:%.*]] = getelementptr i8, ptr null, i64 2 -; THR15-NEXT: [[ARRAYIDX22_3:%.*]] = getelementptr i8, ptr null, i64 2 -; THR15-NEXT: [[ARRAYIDX27_3:%.*]] = getelementptr i8, ptr null, i64 6 -; THR15-NEXT: [[TMP60:%.*]] = load <2 x i8>, ptr [[ARRAYIDX20_3]], align 1 -; THR15-NEXT: [[TMP61:%.*]] = zext <2 x i8> [[TMP60]] to <2 x i32> -; THR15-NEXT: [[TMP62:%.*]] = load <2 x i8>, ptr [[ARRAYIDX22_3]], align 1 -; THR15-NEXT: [[TMP63:%.*]] = zext <2 x i8> [[TMP62]] to <2 x i32> -; THR15-NEXT: [[TMP64:%.*]] = sub <2 x i32> [[TMP61]], [[TMP63]] -; THR15-NEXT: [[TMP65:%.*]] = call <2 x i8> @llvm.masked.gather.v2i8.v2p0(<2 x ptr> zeroinitializer, i32 1, <2 x i1> splat (i1 true), <2 x i8> poison) -; THR15-NEXT: [[TMP66:%.*]] = zext <2 x i8> [[TMP65]] to <2 x i32> -; THR15-NEXT: [[TMP67:%.*]] = load <2 x i8>, ptr [[ARRAYIDX27_3]], align 1 -; THR15-NEXT: [[TMP68:%.*]] = zext <2 x i8> [[TMP67]] to <2 x i32> -; THR15-NEXT: [[TMP69:%.*]] = sub <2 x i32> [[TMP66]], [[TMP68]] -; THR15-NEXT: [[TMP70:%.*]] = shl <2 x i32> [[TMP69]], splat (i32 16) -; THR15-NEXT: [[TMP73:%.*]] = add <2 x i32> [[TMP70]], [[TMP64]] -; THR15-NEXT: [[TMP74:%.*]] = extractelement <2 x i32> [[TMP72]], i32 0 -; THR15-NEXT: [[TMP75:%.*]] = extractelement <2 x i32> [[TMP72]], i32 1 -; THR15-NEXT: [[ADD48_3:%.*]] = add i32 [[TMP75]], [[TMP74]] -; THR15-NEXT: [[SUB45_3:%.*]] = sub i32 [[TMP74]], [[TMP75]] -; THR15-NEXT: [[TMP80:%.*]] = extractelement <2 x i32> [[TMP73]], i32 0 -; THR15-NEXT: [[TMP81:%.*]] = extractelement <2 x i32> [[TMP73]], i32 1 -; THR15-NEXT: [[ADD55_3:%.*]] = add i32 [[TMP81]], [[TMP80]] -; THR15-NEXT: [[SUB47_3:%.*]] = sub i32 [[TMP80]], [[TMP81]] -; THR15-NEXT: [[ADD48_4:%.*]] = add i32 [[ADD55_3]], [[ADD48_3]] -; THR15-NEXT: [[TMP78:%.*]] = shufflevector <2 x i32> [[TMP30]], <2 x i32> poison, <2 x i32> -; THR15-NEXT: [[TMP71:%.*]] = insertelement <2 x i32> [[TMP78]], i32 [[ADD48_3]], i32 0 -; THR15-NEXT: [[TMP83:%.*]] = insertelement <2 x i32> [[TMP30]], i32 [[ADD55_3]], i32 0 -; THR15-NEXT: [[TMP79:%.*]] = sub <2 x i32> [[TMP71]], [[TMP83]] -; THR15-NEXT: [[ADD55_4:%.*]] = add i32 [[SUB47_3]], [[SUB45_3]] -; THR15-NEXT: [[TMP137:%.*]] = shufflevector <2 x i32> [[TMP42]], <2 x i32> poison, <2 x i32> -; THR15-NEXT: [[TMP82:%.*]] = insertelement <2 x i32> [[TMP137]], i32 [[SUB45_3]], i32 0 -; THR15-NEXT: [[TMP84:%.*]] = insertelement <2 x i32> [[TMP42]], i32 [[SUB47_3]], i32 0 -; THR15-NEXT: [[TMP85:%.*]] = sub <2 x i32> [[TMP82]], [[TMP84]] -; THR15-NEXT: [[ADD94:%.*]] = add i32 [[ADD48_4]], [[ADD44_2]] -; THR15-NEXT: [[SUB102:%.*]] = sub i32 [[ADD44_2]], [[ADD48_4]] -; THR15-NEXT: [[SHR_I:%.*]] = lshr i32 [[CONV_3]], 15 -; THR15-NEXT: [[AND_I:%.*]] = and i32 [[SHR_I]], 65537 -; THR15-NEXT: [[MUL_I:%.*]] = mul i32 [[AND_I]], 65535 -; THR15-NEXT: [[SHR_I49:%.*]] = lshr i32 [[TMP44]], 15 -; THR15-NEXT: [[AND_I50:%.*]] = and i32 [[SHR_I49]], 65537 -; THR15-NEXT: [[MUL_I51:%.*]] = mul i32 [[AND_I50]], 65535 -; THR15-NEXT: [[ADD94_1:%.*]] = add i32 [[ADD55_4]], [[ADD46_2]] -; THR15-NEXT: [[SUB102_1:%.*]] = sub i32 [[ADD46_2]], [[ADD55_4]] -; THR15-NEXT: [[SHR_I_1:%.*]] = lshr i32 [[TMP86]], 15 -; THR15-NEXT: [[AND_I_1:%.*]] = and i32 [[SHR_I_1]], 65537 -; THR15-NEXT: [[MUL_I_1:%.*]] = mul i32 [[AND_I_1]], 65535 -; THR15-NEXT: [[SHR_I49_1:%.*]] = lshr i32 [[TMP87]], 15 -; THR15-NEXT: [[AND_I50_1:%.*]] = and i32 [[SHR_I49_1]], 65537 -; THR15-NEXT: [[MUL_I51_1:%.*]] = mul i32 [[AND_I50_1]], 65535 -; THR15-NEXT: [[TMP88:%.*]] = extractelement <2 x i32> [[TMP79]], i32 0 -; THR15-NEXT: [[TMP89:%.*]] = extractelement <2 x i32> [[TMP79]], i32 1 -; THR15-NEXT: [[ADD94_2:%.*]] = add i32 [[TMP88]], [[TMP89]] -; THR15-NEXT: [[SUB102_2:%.*]] = sub i32 [[TMP89]], [[TMP88]] -; THR15-NEXT: [[SHR_I49_2:%.*]] = lshr i32 [[CONV_1]], 15 -; THR15-NEXT: [[AND_I50_2:%.*]] = and i32 [[SHR_I49_2]], 65537 -; THR15-NEXT: [[MUL_I51_2:%.*]] = mul i32 [[AND_I50_2]], 65535 -; THR15-NEXT: [[TMP90:%.*]] = extractelement <2 x i32> [[TMP85]], i32 0 -; THR15-NEXT: [[TMP91:%.*]] = extractelement <2 x i32> [[TMP85]], i32 1 -; THR15-NEXT: [[ADD94_3:%.*]] = add i32 [[TMP90]], [[TMP91]] -; THR15-NEXT: [[SUB102_3:%.*]] = sub i32 [[TMP91]], [[TMP90]] -; THR15-NEXT: [[SHR_I49_3:%.*]] = lshr i32 [[CONV]], 15 -; THR15-NEXT: [[AND_I50_3:%.*]] = and i32 [[SHR_I49_3]], 65537 -; THR15-NEXT: [[MUL_I51_3:%.*]] = mul i32 [[AND_I50_3]], 65535 -; THR15-NEXT: [[TMP92:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8]], align 1 -; THR15-NEXT: [[TMP93:%.*]] = zext <2 x i8> [[TMP92]] to <2 x i32> +; THR15-NEXT: [[TMP1:%.*]] = load i8, ptr null, align 1 +; THR15-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[PIX1]], align 1 ; THR15-NEXT: [[TMP143:%.*]] = load <4 x i8>, ptr [[PIX2]], align 1 -; THR15-NEXT: [[TMP94:%.*]] = shufflevector <4 x i8> [[TMP143]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP95:%.*]] = zext <2 x i8> [[TMP94]] to <2 x i32> ; THR15-NEXT: [[TMP146:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1 -; THR15-NEXT: [[TMP96:%.*]] = shufflevector <4 x i8> [[TMP146]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP97:%.*]] = zext <2 x i8> [[TMP96]] to <2 x i32> ; THR15-NEXT: [[TMP147:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5]], align 1 -; THR15-NEXT: [[TMP98:%.*]] = shufflevector <4 x i8> [[TMP147]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP99:%.*]] = zext <2 x i8> [[TMP98]] to <2 x i32> -; THR15-NEXT: [[TMP100:%.*]] = sub <2 x i32> [[TMP97]], [[TMP99]] -; THR15-NEXT: [[TMP101:%.*]] = shl <2 x i32> [[TMP100]], splat (i32 16) -; THR15-NEXT: [[TMP102:%.*]] = shufflevector <4 x i8> [[TMP143]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP103:%.*]] = zext <2 x i8> [[TMP102]] to <2 x i32> -; THR15-NEXT: [[TMP104:%.*]] = shufflevector <4 x i8> [[TMP146]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP105:%.*]] = zext <2 x i8> [[TMP104]] to <2 x i32> -; THR15-NEXT: [[TMP106:%.*]] = shufflevector <4 x i8> [[TMP147]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP107:%.*]] = zext <2 x i8> [[TMP106]] to <2 x i32> -; THR15-NEXT: [[TMP108:%.*]] = sub <2 x i32> [[TMP105]], [[TMP107]] -; THR15-NEXT: [[TMP109:%.*]] = shl <2 x i32> [[TMP108]], splat (i32 16) -; THR15-NEXT: [[TMP110:%.*]] = insertelement <2 x i32> [[TMP93]], i32 [[CONV33]], i32 1 -; THR15-NEXT: [[TMP111:%.*]] = sub <2 x i32> [[TMP110]], [[TMP103]] -; THR15-NEXT: [[TMP112:%.*]] = add <2 x i32> [[TMP109]], [[TMP111]] -; THR15-NEXT: [[TMP113:%.*]] = insertelement <2 x i32> [[TMP93]], i32 [[CONV]], i32 0 -; THR15-NEXT: [[TMP114:%.*]] = sub <2 x i32> [[TMP113]], [[TMP95]] -; THR15-NEXT: [[TMP115:%.*]] = add <2 x i32> [[TMP101]], [[TMP114]] -; THR15-NEXT: [[TMP116:%.*]] = shufflevector <2 x i32> [[TMP112]], <2 x i32> [[TMP115]], <2 x i32> -; THR15-NEXT: [[TMP117:%.*]] = add <2 x i32> [[TMP112]], [[TMP115]] -; THR15-NEXT: [[TMP118:%.*]] = sub <2 x i32> [[TMP115]], [[TMP112]] -; THR15-NEXT: [[TMP119:%.*]] = extractelement <2 x i32> [[TMP117]], i32 0 -; THR15-NEXT: [[TMP120:%.*]] = extractelement <2 x i32> [[TMP117]], i32 1 -; THR15-NEXT: [[ADD48:%.*]] = add i32 [[TMP120]], [[TMP119]] -; THR15-NEXT: [[SUB51:%.*]] = sub i32 [[TMP119]], [[TMP120]] -; THR15-NEXT: [[TMP121:%.*]] = extractelement <2 x i32> [[TMP118]], i32 0 -; THR15-NEXT: [[TMP122:%.*]] = extractelement <2 x i32> [[TMP118]], i32 1 -; THR15-NEXT: [[ADD55:%.*]] = add i32 [[TMP122]], [[TMP121]] -; THR15-NEXT: [[SUB59:%.*]] = sub i32 [[TMP121]], [[TMP122]] -; THR15-NEXT: [[SHR_I59:%.*]] = lshr i32 [[TMP120]], 15 -; THR15-NEXT: [[AND_I60:%.*]] = and i32 [[SHR_I59]], 65537 -; THR15-NEXT: [[MUL_I61:%.*]] = mul i32 [[AND_I60]], 65535 -; THR15-NEXT: [[SHR_I59_1:%.*]] = lshr i32 [[TMP122]], 15 -; THR15-NEXT: [[AND_I60_1:%.*]] = and i32 [[SHR_I59_1]], 65537 -; THR15-NEXT: [[MUL_I61_1:%.*]] = mul i32 [[AND_I60_1]], 65535 -; THR15-NEXT: [[TMP123:%.*]] = load <2 x i8>, ptr [[ARRAYIDX8_1]], align 1 -; THR15-NEXT: [[TMP124:%.*]] = zext <2 x i8> [[TMP123]] to <2 x i32> +; THR15-NEXT: [[TMP6:%.*]] = load <4 x i8>, ptr [[ADD_PTR3]], align 1 ; THR15-NEXT: [[TMP148:%.*]] = load <4 x i8>, ptr [[ADD_PTR644]], align 1 -; THR15-NEXT: [[TMP125:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP126:%.*]] = zext <2 x i8> [[TMP125]] to <2 x i32> ; THR15-NEXT: [[TMP152:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_1]], align 1 -; THR15-NEXT: [[TMP127:%.*]] = shufflevector <4 x i8> [[TMP152]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP128:%.*]] = zext <2 x i8> [[TMP127]] to <2 x i32> ; THR15-NEXT: [[TMP153:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_1]], align 1 -; THR15-NEXT: [[TMP129:%.*]] = shufflevector <4 x i8> [[TMP153]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP130:%.*]] = zext <2 x i8> [[TMP129]] to <2 x i32> -; THR15-NEXT: [[TMP131:%.*]] = sub <2 x i32> [[TMP128]], [[TMP130]] -; THR15-NEXT: [[TMP132:%.*]] = shl <2 x i32> [[TMP131]], splat (i32 16) -; THR15-NEXT: [[TMP138:%.*]] = shufflevector <4 x i8> [[TMP148]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP139:%.*]] = zext <2 x i8> [[TMP138]] to <2 x i32> -; THR15-NEXT: [[TMP154:%.*]] = shufflevector <4 x i8> [[TMP152]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP155:%.*]] = zext <2 x i8> [[TMP154]] to <2 x i32> -; THR15-NEXT: [[TMP133:%.*]] = shufflevector <4 x i8> [[TMP153]], <4 x i8> poison, <2 x i32> -; THR15-NEXT: [[TMP134:%.*]] = zext <2 x i8> [[TMP133]] to <2 x i32> -; THR15-NEXT: [[TMP135:%.*]] = sub <2 x i32> [[TMP155]], [[TMP134]] -; THR15-NEXT: [[TMP170:%.*]] = shl <2 x i32> [[TMP135]], splat (i32 16) -; THR15-NEXT: [[TMP140:%.*]] = insertelement <2 x i32> [[TMP124]], i32 [[CONV33_1]], i32 1 -; THR15-NEXT: [[TMP141:%.*]] = sub <2 x i32> [[TMP140]], [[TMP139]] -; THR15-NEXT: [[TMP171:%.*]] = add <2 x i32> [[TMP170]], [[TMP141]] -; THR15-NEXT: [[TMP186:%.*]] = insertelement <2 x i32> [[TMP124]], i32 [[CONV_1]], i32 0 -; THR15-NEXT: [[TMP187:%.*]] = sub <2 x i32> [[TMP186]], [[TMP126]] -; THR15-NEXT: [[TMP142:%.*]] = add <2 x i32> [[TMP132]], [[TMP187]] -; THR15-NEXT: [[TMP136:%.*]] = add <2 x i32> [[TMP171]], [[TMP142]] -; THR15-NEXT: [[TMP149:%.*]] = sub <2 x i32> [[TMP142]], [[TMP171]] -; THR15-NEXT: [[TMP144:%.*]] = extractelement <2 x i32> [[TMP136]], i32 0 -; THR15-NEXT: [[TMP145:%.*]] = extractelement <2 x i32> [[TMP136]], i32 1 -; THR15-NEXT: [[ADD48_2:%.*]] = add i32 [[TMP145]], [[TMP144]] -; THR15-NEXT: [[SUB45_1:%.*]] = sub i32 [[TMP144]], [[TMP145]] -; THR15-NEXT: [[TMP150:%.*]] = extractelement <2 x i32> [[TMP149]], i32 0 -; THR15-NEXT: [[TMP151:%.*]] = extractelement <2 x i32> [[TMP149]], i32 1 -; THR15-NEXT: [[ADD48_1:%.*]] = add i32 [[TMP151]], [[TMP150]] -; THR15-NEXT: [[SUB51_1:%.*]] = sub i32 [[TMP150]], [[TMP151]] -; THR15-NEXT: [[SHR_I54:%.*]] = lshr i32 [[TMP145]], 15 -; THR15-NEXT: [[AND_I55:%.*]] = and i32 [[SHR_I54]], 65537 -; THR15-NEXT: [[MUL_I56:%.*]] = mul i32 [[AND_I55]], 65535 -; THR15-NEXT: [[SHR_I54_1:%.*]] = lshr i32 [[TMP151]], 15 -; THR15-NEXT: [[AND_I55_1:%.*]] = and i32 [[SHR_I54_1]], 65537 -; THR15-NEXT: [[MUL_I56_1:%.*]] = mul i32 [[AND_I55_1]], 65535 -; THR15-NEXT: [[TMP156:%.*]] = lshr <2 x i32> [[TMP124]], splat (i32 15) -; THR15-NEXT: [[TMP157:%.*]] = and <2 x i32> [[TMP156]], splat (i32 65537) -; THR15-NEXT: [[TMP158:%.*]] = mul <2 x i32> [[TMP157]], splat (i32 65535) -; THR15-NEXT: [[ADD78:%.*]] = add i32 [[ADD48_2]], [[ADD48]] -; THR15-NEXT: [[SUB86:%.*]] = sub i32 [[ADD48]], [[ADD48_2]] -; THR15-NEXT: [[ADD103:%.*]] = add i32 [[ADD94]], [[ADD78]] -; THR15-NEXT: [[SUB104:%.*]] = sub i32 [[ADD78]], [[ADD94]] -; THR15-NEXT: [[ADD105:%.*]] = add i32 [[SUB102]], [[SUB86]] -; THR15-NEXT: [[SUB106:%.*]] = sub i32 [[SUB86]], [[SUB102]] -; THR15-NEXT: [[ADD_I:%.*]] = add i32 [[MUL_I]], [[ADD103]] -; THR15-NEXT: [[XOR_I:%.*]] = xor i32 [[ADD_I]], [[CONV_3]] -; THR15-NEXT: [[ADD_I52:%.*]] = add i32 [[MUL_I51]], [[ADD105]] -; THR15-NEXT: [[XOR_I53:%.*]] = xor i32 [[ADD_I52]], [[TMP44]] -; THR15-NEXT: [[ADD_I57:%.*]] = add i32 [[MUL_I56]], [[SUB104]] -; THR15-NEXT: [[XOR_I58:%.*]] = xor i32 [[ADD_I57]], [[TMP145]] -; THR15-NEXT: [[ADD_I62:%.*]] = add i32 [[MUL_I61]], [[SUB106]] -; THR15-NEXT: [[XOR_I63:%.*]] = xor i32 [[ADD_I62]], [[TMP120]] -; THR15-NEXT: [[ADD110:%.*]] = add i32 [[XOR_I53]], [[XOR_I]] -; THR15-NEXT: [[ADD112:%.*]] = add i32 [[ADD110]], [[XOR_I58]] -; THR15-NEXT: [[ADD113:%.*]] = add i32 [[ADD112]], [[XOR_I63]] -; THR15-NEXT: [[ADD78_1:%.*]] = add i32 [[ADD48_1]], [[ADD55]] -; THR15-NEXT: [[SUB86_1:%.*]] = sub i32 [[ADD55]], [[ADD48_1]] -; THR15-NEXT: [[ADD103_1:%.*]] = add i32 [[ADD94_1]], [[ADD78_1]] -; THR15-NEXT: [[SUB104_1:%.*]] = sub i32 [[ADD78_1]], [[ADD94_1]] -; THR15-NEXT: [[ADD105_1:%.*]] = add i32 [[SUB102_1]], [[SUB86_1]] -; THR15-NEXT: [[SUB106_1:%.*]] = sub i32 [[SUB86_1]], [[SUB102_1]] -; THR15-NEXT: [[ADD_I_1:%.*]] = add i32 [[MUL_I_1]], [[ADD103_1]] -; THR15-NEXT: [[XOR_I_1:%.*]] = xor i32 [[ADD_I_1]], [[TMP86]] -; THR15-NEXT: [[ADD_I52_1:%.*]] = add i32 [[MUL_I51_1]], [[ADD105_1]] -; THR15-NEXT: [[XOR_I53_1:%.*]] = xor i32 [[ADD_I52_1]], [[TMP87]] -; THR15-NEXT: [[ADD_I57_1:%.*]] = add i32 [[MUL_I56_1]], [[SUB104_1]] -; THR15-NEXT: [[XOR_I58_1:%.*]] = xor i32 [[ADD_I57_1]], [[TMP151]] -; THR15-NEXT: [[ADD_I62_1:%.*]] = add i32 [[MUL_I61_1]], [[SUB106_1]] -; THR15-NEXT: [[XOR_I63_1:%.*]] = xor i32 [[ADD_I62_1]], [[TMP122]] -; THR15-NEXT: [[ADD108_1:%.*]] = add i32 [[XOR_I53_1]], [[ADD113]] -; THR15-NEXT: [[ADD110_1:%.*]] = add i32 [[ADD108_1]], [[XOR_I_1]] -; THR15-NEXT: [[ADD112_1:%.*]] = add i32 [[ADD110_1]], [[XOR_I58_1]] -; THR15-NEXT: [[ADD113_1:%.*]] = add i32 [[ADD112_1]], [[XOR_I63_1]] -; THR15-NEXT: [[ADD78_2:%.*]] = add i32 [[SUB45_1]], [[SUB51]] -; THR15-NEXT: [[SUB86_2:%.*]] = sub i32 [[SUB51]], [[SUB45_1]] -; THR15-NEXT: [[TMP159:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_2]], i32 0 -; THR15-NEXT: [[TMP160:%.*]] = shufflevector <2 x i32> [[TMP159]], <2 x i32> poison, <2 x i32> zeroinitializer -; THR15-NEXT: [[TMP161:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_2]], i32 0 -; THR15-NEXT: [[TMP162:%.*]] = shufflevector <2 x i32> [[TMP161]], <2 x i32> poison, <2 x i32> zeroinitializer -; THR15-NEXT: [[TMP163:%.*]] = add <2 x i32> [[TMP160]], [[TMP162]] -; THR15-NEXT: [[TMP164:%.*]] = sub <2 x i32> [[TMP160]], [[TMP162]] -; THR15-NEXT: [[TMP165:%.*]] = shufflevector <2 x i32> [[TMP163]], <2 x i32> [[TMP164]], <2 x i32> -; THR15-NEXT: [[ADD105_2:%.*]] = add i32 [[SUB102_2]], [[SUB86_2]] -; THR15-NEXT: [[SUB106_2:%.*]] = sub i32 [[SUB86_2]], [[SUB102_2]] -; THR15-NEXT: [[ADD_I52_2:%.*]] = add i32 [[MUL_I51_2]], [[ADD105_2]] -; THR15-NEXT: [[XOR_I53_2:%.*]] = xor i32 [[ADD_I52_2]], [[CONV_1]] -; THR15-NEXT: [[TMP166:%.*]] = add <2 x i32> [[TMP158]], [[TMP165]] -; THR15-NEXT: [[TMP167:%.*]] = xor <2 x i32> [[TMP166]], [[TMP124]] -; THR15-NEXT: [[SHR_I59_2:%.*]] = lshr i32 [[TMP119]], 15 -; THR15-NEXT: [[AND_I60_2:%.*]] = and i32 [[SHR_I59_2]], 65537 -; THR15-NEXT: [[MUL_I61_2:%.*]] = mul i32 [[AND_I60_2]], 65535 -; THR15-NEXT: [[ADD_I62_2:%.*]] = add i32 [[MUL_I61_2]], [[SUB106_2]] -; THR15-NEXT: [[XOR_I63_2:%.*]] = xor i32 [[ADD_I62_2]], [[TMP119]] -; THR15-NEXT: [[ADD108_2:%.*]] = add i32 [[XOR_I53_2]], [[ADD113_1]] -; THR15-NEXT: [[TMP168:%.*]] = extractelement <2 x i32> [[TMP167]], i32 0 -; THR15-NEXT: [[ADD110_2:%.*]] = add i32 [[ADD108_2]], [[TMP168]] -; THR15-NEXT: [[TMP169:%.*]] = extractelement <2 x i32> [[TMP167]], i32 1 -; THR15-NEXT: [[ADD112_2:%.*]] = add i32 [[ADD110_2]], [[TMP169]] -; THR15-NEXT: [[ADD113_2:%.*]] = add i32 [[ADD112_2]], [[XOR_I63_2]] -; THR15-NEXT: [[ADD78_3:%.*]] = add i32 [[SUB51_1]], [[SUB59]] -; THR15-NEXT: [[SUB86_3:%.*]] = sub i32 [[SUB59]], [[SUB51_1]] -; THR15-NEXT: [[TMP172:%.*]] = insertelement <2 x i32> poison, i32 [[ADD78_3]], i32 0 -; THR15-NEXT: [[TMP173:%.*]] = shufflevector <2 x i32> [[TMP172]], <2 x i32> poison, <2 x i32> zeroinitializer -; THR15-NEXT: [[TMP174:%.*]] = insertelement <2 x i32> poison, i32 [[ADD94_3]], i32 0 -; THR15-NEXT: [[TMP175:%.*]] = shufflevector <2 x i32> [[TMP174]], <2 x i32> poison, <2 x i32> zeroinitializer -; THR15-NEXT: [[TMP176:%.*]] = add <2 x i32> [[TMP173]], [[TMP175]] -; THR15-NEXT: [[TMP177:%.*]] = sub <2 x i32> [[TMP173]], [[TMP175]] -; THR15-NEXT: [[TMP178:%.*]] = shufflevector <2 x i32> [[TMP176]], <2 x i32> [[TMP177]], <2 x i32> -; THR15-NEXT: [[ADD105_3:%.*]] = add i32 [[SUB102_3]], [[SUB86_3]] -; THR15-NEXT: [[SUB106_3:%.*]] = sub i32 [[SUB86_3]], [[SUB102_3]] -; THR15-NEXT: [[ADD_I52_3:%.*]] = add i32 [[MUL_I51_3]], [[ADD105_3]] -; THR15-NEXT: [[XOR_I53_3:%.*]] = xor i32 [[ADD_I52_3]], [[CONV]] -; THR15-NEXT: [[TMP179:%.*]] = lshr <2 x i32> [[TMP93]], splat (i32 15) -; THR15-NEXT: [[TMP180:%.*]] = and <2 x i32> [[TMP179]], splat (i32 65537) -; THR15-NEXT: [[TMP181:%.*]] = mul <2 x i32> [[TMP180]], splat (i32 65535) -; THR15-NEXT: [[TMP182:%.*]] = add <2 x i32> [[TMP181]], [[TMP178]] -; THR15-NEXT: [[TMP183:%.*]] = xor <2 x i32> [[TMP182]], [[TMP93]] -; THR15-NEXT: [[SHR_I59_3:%.*]] = lshr i32 [[CONV33]], 15 -; THR15-NEXT: [[AND_I60_3:%.*]] = and i32 [[SHR_I59_3]], 65537 -; THR15-NEXT: [[MUL_I61_3:%.*]] = mul i32 [[AND_I60_3]], 65535 -; THR15-NEXT: [[ADD_I62_3:%.*]] = add i32 [[MUL_I61_3]], [[SUB106_3]] -; THR15-NEXT: [[XOR_I63_3:%.*]] = xor i32 [[ADD_I62_3]], [[CONV33]] -; THR15-NEXT: [[ADD108_3:%.*]] = add i32 [[XOR_I53_3]], [[ADD113_2]] -; THR15-NEXT: [[TMP184:%.*]] = extractelement <2 x i32> [[TMP183]], i32 0 -; THR15-NEXT: [[ADD110_3:%.*]] = add i32 [[ADD108_3]], [[TMP184]] -; THR15-NEXT: [[TMP185:%.*]] = extractelement <2 x i32> [[TMP183]], i32 1 -; THR15-NEXT: [[ADD112_3:%.*]] = add i32 [[ADD110_3]], [[TMP185]] -; THR15-NEXT: [[ADD113_3:%.*]] = add i32 [[ADD112_3]], [[XOR_I63_3]] +; THR15-NEXT: [[TMP10:%.*]] = load <4 x i8>, ptr [[ADD_PTR_1]], align 1 +; THR15-NEXT: [[TMP11:%.*]] = load <4 x i8>, ptr [[ADD_PTR64_1]], align 1 +; THR15-NEXT: [[TMP12:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3_2]], align 1 +; THR15-NEXT: [[TMP13:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_2]], align 1 +; THR15-NEXT: [[TMP14:%.*]] = load <4 x i8>, ptr null, align 1 +; THR15-NEXT: [[TMP15:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP10]], i64 0) +; THR15-NEXT: [[TMP16:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP15]], <4 x i8> [[TMP14]], i64 4) +; THR15-NEXT: [[TMP17:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP16]], <4 x i8> [[TMP2]], i64 8) +; THR15-NEXT: [[TMP18:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP17]], <4 x i8> [[TMP6]], i64 12) +; THR15-NEXT: [[TMP19:%.*]] = zext <16 x i8> [[TMP18]] to <16 x i32> +; THR15-NEXT: [[TMP20:%.*]] = load <4 x i8>, ptr null, align 1 +; THR15-NEXT: [[TMP21:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP11]], i64 0) +; THR15-NEXT: [[TMP22:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP21]], <4 x i8> [[TMP20]], i64 4) +; THR15-NEXT: [[TMP23:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP22]], <4 x i8> [[TMP143]], i64 8) +; THR15-NEXT: [[TMP24:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP23]], <4 x i8> [[TMP148]], i64 12) +; THR15-NEXT: [[TMP25:%.*]] = zext <16 x i8> [[TMP24]] to <16 x i32> +; THR15-NEXT: [[TMP26:%.*]] = sub <16 x i32> [[TMP19]], [[TMP25]] +; THR15-NEXT: [[TMP27:%.*]] = shufflevector <16 x i32> [[TMP26]], <16 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP28:%.*]] = call <2 x i8> @llvm.experimental.vp.strided.load.v2i8.p0.i64(ptr align 1 null, i64 4, <2 x i1> splat (i1 true), i32 2) +; THR15-NEXT: [[TMP29:%.*]] = shufflevector <2 x i8> [[TMP28]], <2 x i8> poison, <4 x i32> +; THR15-NEXT: [[TMP30:%.*]] = shufflevector <4 x i8> [[TMP12]], <4 x i8> [[TMP29]], <16 x i32> +; THR15-NEXT: [[TMP31:%.*]] = shufflevector <4 x i8> [[TMP152]], <4 x i8> poison, <16 x i32> +; THR15-NEXT: [[TMP32:%.*]] = shufflevector <16 x i8> [[TMP30]], <16 x i8> [[TMP31]], <16 x i32> +; THR15-NEXT: [[TMP33:%.*]] = shufflevector <4 x i8> [[TMP146]], <4 x i8> poison, <16 x i32> +; THR15-NEXT: [[TMP34:%.*]] = shufflevector <16 x i8> [[TMP32]], <16 x i8> [[TMP33]], <16 x i32> +; THR15-NEXT: [[TMP35:%.*]] = insertelement <16 x i8> [[TMP34]], i8 [[TMP1]], i32 5 +; THR15-NEXT: [[TMP36:%.*]] = insertelement <16 x i8> [[TMP35]], i8 [[TMP48]], i32 9 +; THR15-NEXT: [[TMP37:%.*]] = zext <16 x i8> [[TMP36]] to <16 x i32> +; THR15-NEXT: [[TMP38:%.*]] = load <4 x i8>, ptr [[ARRAYIDX5_3]], align 1 +; THR15-NEXT: [[TMP39:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> poison, <4 x i8> [[TMP13]], i64 0) +; THR15-NEXT: [[TMP40:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP39]], <4 x i8> [[TMP38]], i64 4) +; THR15-NEXT: [[TMP41:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP40]], <4 x i8> [[TMP147]], i64 8) +; THR15-NEXT: [[TMP42:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v4i8(<16 x i8> [[TMP41]], <4 x i8> [[TMP153]], i64 12) +; THR15-NEXT: [[TMP43:%.*]] = zext <16 x i8> [[TMP42]] to <16 x i32> +; THR15-NEXT: [[TMP44:%.*]] = shufflevector <16 x i32> [[TMP43]], <16 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP45:%.*]] = sub <16 x i32> [[TMP37]], [[TMP44]] +; THR15-NEXT: [[TMP46:%.*]] = shl <16 x i32> [[TMP45]], splat (i32 16) +; THR15-NEXT: [[TMP47:%.*]] = add <16 x i32> [[TMP46]], [[TMP27]] +; THR15-NEXT: [[TMP70:%.*]] = shufflevector <16 x i32> [[TMP47]], <16 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP49:%.*]] = add <16 x i32> [[TMP47]], [[TMP70]] +; THR15-NEXT: [[TMP50:%.*]] = sub <16 x i32> [[TMP47]], [[TMP70]] +; THR15-NEXT: [[TMP51:%.*]] = shufflevector <16 x i32> [[TMP49]], <16 x i32> [[TMP50]], <16 x i32> +; THR15-NEXT: [[TMP52:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP53:%.*]] = add <16 x i32> [[TMP51]], [[TMP52]] +; THR15-NEXT: [[TMP54:%.*]] = sub <16 x i32> [[TMP51]], [[TMP52]] +; THR15-NEXT: [[TMP55:%.*]] = shufflevector <16 x i32> [[TMP53]], <16 x i32> [[TMP54]], <16 x i32> +; THR15-NEXT: [[TMP56:%.*]] = shufflevector <16 x i32> [[TMP55]], <16 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP57:%.*]] = sub <16 x i32> [[TMP55]], [[TMP56]] +; THR15-NEXT: [[TMP58:%.*]] = add <16 x i32> [[TMP55]], [[TMP56]] +; THR15-NEXT: [[TMP59:%.*]] = shufflevector <16 x i32> [[TMP57]], <16 x i32> [[TMP58]], <16 x i32> +; THR15-NEXT: [[TMP60:%.*]] = shufflevector <16 x i32> [[TMP59]], <16 x i32> poison, <16 x i32> +; THR15-NEXT: [[TMP61:%.*]] = add <16 x i32> [[TMP59]], [[TMP60]] +; THR15-NEXT: [[TMP62:%.*]] = sub <16 x i32> [[TMP59]], [[TMP60]] +; THR15-NEXT: [[TMP63:%.*]] = shufflevector <16 x i32> [[TMP61]], <16 x i32> [[TMP62]], <16 x i32> +; THR15-NEXT: [[TMP64:%.*]] = shufflevector <16 x i32> [[TMP51]], <16 x i32> [[TMP19]], <16 x i32> +; THR15-NEXT: [[TMP65:%.*]] = lshr <16 x i32> [[TMP64]], splat (i32 15) +; THR15-NEXT: [[TMP66:%.*]] = and <16 x i32> [[TMP65]], splat (i32 65537) +; THR15-NEXT: [[TMP67:%.*]] = mul <16 x i32> [[TMP66]], splat (i32 65535) +; THR15-NEXT: [[TMP68:%.*]] = add <16 x i32> [[TMP67]], [[TMP63]] +; THR15-NEXT: [[TMP69:%.*]] = xor <16 x i32> [[TMP68]], [[TMP64]] +; THR15-NEXT: [[ADD113_3:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP69]]) ; THR15-NEXT: ret i32 [[ADD113_3]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll index 5b0f4a69de4c3..7723746dda301 100644 --- a/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/reductions.ll @@ -342,8 +342,8 @@ define void @reduce_or_2() { ; ZVFHMIN-NEXT: [[TMP4:%.*]] = insertelement <16 x i64> , i64 [[TMP1]], i32 6 ; ZVFHMIN-NEXT: [[TMP5:%.*]] = icmp ult <16 x i64> [[TMP4]], zeroinitializer ; ZVFHMIN-NEXT: [[RDX_OP:%.*]] = or <16 x i1> [[TMP3]], [[TMP5]] -; ZVFHMIN-NEXT: [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[RDX_OP]]) -; ZVFHMIN-NEXT: br i1 [[OP_RDX]], label [[TMP9:%.*]], label [[TMP8:%.*]] +; ZVFHMIN-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[RDX_OP]]) +; ZVFHMIN-NEXT: br i1 [[TMP6]], label [[TMP8:%.*]], label [[TMP7:%.*]] ; ZVFHMIN: 7: ; ZVFHMIN-NEXT: ret void ; ZVFHMIN: 8: @@ -356,8 +356,8 @@ define void @reduce_or_2() { ; ZVL128-NEXT: [[TMP4:%.*]] = insertelement <16 x i64> , i64 [[TMP1]], i32 6 ; ZVL128-NEXT: [[TMP5:%.*]] = icmp ult <16 x i64> [[TMP4]], zeroinitializer ; ZVL128-NEXT: [[RDX_OP:%.*]] = or <16 x i1> [[TMP3]], [[TMP5]] -; ZVL128-NEXT: [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[RDX_OP]]) -; ZVL128-NEXT: br i1 [[OP_RDX]], label [[TMP9:%.*]], label [[TMP8:%.*]] +; ZVL128-NEXT: [[TMP6:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[RDX_OP]]) +; ZVL128-NEXT: br i1 [[TMP6]], label [[TMP8:%.*]], label [[TMP7:%.*]] ; ZVL128: 7: ; ZVL128-NEXT: ret void ; ZVL128: 8: @@ -365,16 +365,14 @@ define void @reduce_or_2() { ; ; ZVL256-LABEL: @reduce_or_2( ; ZVL256-NEXT: [[TMP1:%.*]] = shl i64 0, 0 -; ZVL256-NEXT: [[TMP2:%.*]] = insertelement <16 x i64> , i64 [[TMP1]], i32 15 -; ZVL256-NEXT: [[TMP3:%.*]] = icmp ult <16 x i64> [[TMP2]], zeroinitializer -; ZVL256-NEXT: [[TMP4:%.*]] = insertelement <16 x i64> , i64 [[TMP1]], i32 6 -; ZVL256-NEXT: [[TMP5:%.*]] = icmp ult <16 x i64> [[TMP4]], zeroinitializer -; ZVL256-NEXT: [[RDX_OP:%.*]] = or <16 x i1> [[TMP3]], [[TMP5]] -; ZVL256-NEXT: [[OP_RDX:%.*]] = call i1 @llvm.vector.reduce.or.v16i1(<16 x i1> [[RDX_OP]]) -; ZVL256-NEXT: br i1 [[OP_RDX]], label [[TMP9:%.*]], label [[TMP8:%.*]] -; ZVL256: 7: +; ZVL256-NEXT: [[TMP2:%.*]] = insertelement <32 x i64> , i64 [[TMP1]], i32 15 +; ZVL256-NEXT: [[TMP3:%.*]] = shufflevector <32 x i64> [[TMP2]], <32 x i64> poison, <32 x i32> +; ZVL256-NEXT: [[TMP4:%.*]] = icmp ult <32 x i64> [[TMP3]], zeroinitializer +; ZVL256-NEXT: [[TMP5:%.*]] = call i1 @llvm.vector.reduce.or.v32i1(<32 x i1> [[TMP4]]) +; ZVL256-NEXT: br i1 [[TMP5]], label [[TMP7:%.*]], label [[TMP6:%.*]] +; ZVL256: 6: ; ZVL256-NEXT: ret void -; ZVL256: 8: +; ZVL256: 7: ; ZVL256-NEXT: ret void ; ; ZVL512-LABEL: @reduce_or_2(