diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index 723f6aea1b76f..96e3d3d47f2d0 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -371,6 +371,9 @@ LLVM_ABI bool canSinkOrHoistInst(Instruction &I, AAResults *AA,
 /// Returns the llvm.vector.reduce intrinsic that corresponds to the recurrence
 /// kind.
 LLVM_ABI constexpr Intrinsic::ID getReductionIntrinsicID(RecurKind RK);
+/// Returns the llvm.vector.reduce min/max intrinsic that corresponds to the
+/// intrinsic op.
+LLVM_ABI Intrinsic::ID getMinMaxReductionIntrinsicID(Intrinsic::ID IID);
 /// Returns the arithmetic instruction opcode used when expanding a reduction.
 LLVM_ABI unsigned getArithmeticReductionInstruction(Intrinsic::ID RdxID);
 
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index 2d830f3b6f952..843364eb34f83 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -956,6 +956,21 @@ constexpr Intrinsic::ID llvm::getReductionIntrinsicID(RecurKind RK) {
   }
 }
 
+Intrinsic::ID llvm::getMinMaxReductionIntrinsicID(Intrinsic::ID IID) {
+  switch (IID) {
+  default:
+    llvm_unreachable("Unexpected intrinsic id");
+  case Intrinsic::umin:
+    return Intrinsic::vector_reduce_umin;
+  case Intrinsic::umax:
+    return Intrinsic::vector_reduce_umax;
+  case Intrinsic::smin:
+    return Intrinsic::vector_reduce_smin;
+  case Intrinsic::smax:
+    return Intrinsic::vector_reduce_smax;
+  }
+}
+
 // This is the inverse to getReductionForBinop
 unsigned llvm::getArithmeticReductionInstruction(Intrinsic::ID RdxID) {
   switch (RdxID) {
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 613dec121e016..221970f1ebb41 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -135,6 +135,7 @@ class VectorCombine {
   bool foldShuffleOfIntrinsics(Instruction &I);
   bool foldShuffleToIdentity(Instruction &I);
   bool foldShuffleFromReductions(Instruction &I);
+  bool foldShuffleChainsToReduce(Instruction &I);
   bool foldCastFromReductions(Instruction &I);
   bool foldSelectShuffle(Instruction &I, bool FromReduction = false);
   bool foldInterleaveIntrinsics(Instruction &I);
@@ -3136,6 +3137,267 @@ bool VectorCombine::foldShuffleFromReductions(Instruction &I) {
   return MadeChanges;
 }
 
+/// For a given chain of patterns of the following form:
+///
+/// ```
+/// %1 = shufflevector %0, poison, mask
+///
+/// %2 = tail call llvm.<umin/umax/smin/smax>(%0, %1)
+/// OR
+/// %2 = add/mul/or/and/xor %0, %1
+///
+/// %3 = shufflevector %2, poison, mask
+/// ...
+/// ...
+/// %(i - 1) = tail call llvm.<umin/umax/smin/smax>(%(i - 3), %(i - 2))
+/// OR
+/// %(i - 1) = add/mul/or/and/xor %(i - 3), %(i - 2)
+///
+/// %(i) = extractelement %(i - 1), 0
+/// ```
+///
+/// Where:
+/// `mask` follows a partition pattern:
+///
+/// Ex:
+/// [n = 8, p = poison]
+///
+/// 4 5 6 7 | p p p p
+/// 2 3 | p p p p p p
+/// 1 | p p p p p p p
+///
+/// For powers of 2, there's a consistent pattern, but for other cases
+/// the parity of the current half value at each step decides the
+/// next partition half (see `ExpectedParityMask` below for how this
+/// is generalised).
+///
+/// Ex:
+/// [n = 6]
+///
+/// 3 4 5 | p p p
+/// 1 2 | p p p p
+/// 1 | p p p p p
+bool VectorCombine::foldShuffleChainsToReduce(Instruction &I) {
+  // Going bottom-up for the pattern.
+  std::queue<Value *> InstWorklist;
+  InstructionCost OrigCost = 0;
+
+  // Common instruction operation after each shuffle op.
+  std::optional<Intrinsic::ID> CommonCallOp = std::nullopt;
+  std::optional<Instruction::BinaryOps> CommonBinOp = std::nullopt;
+
+  bool IsFirstCallOrBinInst = true;
+  bool ShouldBeCallOrBinInst = true;
+
+  // This stores the last used instructions for shuffle/common op.
+  //
+  // PrevVecV[0] / PrevVecV[1] store the last two simultaneous
+  // instructions from either shuffle/common op.
+  SmallVector<Value *, 2> PrevVecV(2, nullptr);
+
+  Value *VecOpEE;
+  if (!match(&I, m_ExtractElt(m_Value(VecOpEE), m_Zero())))
+    return false;
+
+  auto *FVT = dyn_cast<FixedVectorType>(VecOpEE->getType());
+  if (!FVT)
+    return false;
+
+  int64_t VecSize = FVT->getNumElements();
+  if (VecSize < 2)
+    return false;
+
+  // Number of levels would be ~log2(n), considering we always partition
+  // by half for this fold pattern.
+  unsigned int NumLevels = Log2_64_Ceil(VecSize), VisitedCnt = 0;
+  int64_t ShuffleMaskHalf = 1, ExpectedParityMask = 0;
+
+  // This is how we generalise for all element sizes.
+  // At each step, if the vector size is odd, we need non-poison
+  // values to cover the dominant half so we don't miss out on any element.
+  //
+  // This mask will help us retrieve this as we go from bottom to top:
+  //
+  // Mask Set   -> N = N * 2 - 1
+  // Mask Unset -> N = N * 2
+  for (int Cur = VecSize, Mask = NumLevels - 1; Cur > 1;
+       Cur = (Cur + 1) / 2, --Mask) {
+    if (Cur & 1)
+      ExpectedParityMask |= (1ll << Mask);
+  }
+
+  InstWorklist.push(VecOpEE);
+
+  while (!InstWorklist.empty()) {
+    Value *CI = InstWorklist.front();
+    InstWorklist.pop();
+
+    if (auto *II = dyn_cast<IntrinsicInst>(CI)) {
+      if (!ShouldBeCallOrBinInst)
+        return false;
+
+      if (!IsFirstCallOrBinInst &&
+          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
+        return false;
+
+      // For the first found call/bin op, the vector has to come from the
+      // extract element op.
+      if (II != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
+        return false;
+      IsFirstCallOrBinInst = false;
+
+      if (!CommonCallOp)
+        CommonCallOp = II->getIntrinsicID();
+      if (II->getIntrinsicID() != *CommonCallOp)
+        return false;
+
+      switch (II->getIntrinsicID()) {
+      case Intrinsic::umin:
+      case Intrinsic::umax:
+      case Intrinsic::smin:
+      case Intrinsic::smax: {
+        auto *Op0 = II->getOperand(0);
+        auto *Op1 = II->getOperand(1);
+        PrevVecV[0] = Op0;
+        PrevVecV[1] = Op1;
+        break;
+      }
+      default:
+        return false;
+      }
+      ShouldBeCallOrBinInst ^= 1;
+
+      IntrinsicCostAttributes ICA(
+          *CommonCallOp, II->getType(),
+          {PrevVecV[0]->getType(), PrevVecV[1]->getType()});
+      OrigCost += TTI.getIntrinsicInstrCost(ICA, CostKind);
+
+      // We may need a swap here since it can be (a, b) or (b, a)
+      // and accordingly change as we go up.
+      if (!isa<ShuffleVectorInst>(PrevVecV[1]))
+        std::swap(PrevVecV[0], PrevVecV[1]);
+      InstWorklist.push(PrevVecV[1]);
+      InstWorklist.push(PrevVecV[0]);
+    } else if (auto *BinOp = dyn_cast<BinaryOperator>(CI)) {
+      // Similar logic for bin ops.
+
+      if (!ShouldBeCallOrBinInst)
+        return false;
+
+      if (!IsFirstCallOrBinInst &&
+          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
+        return false;
+
+      if (BinOp != (IsFirstCallOrBinInst ? VecOpEE : PrevVecV[0]))
+        return false;
+      IsFirstCallOrBinInst = false;
+
+      if (!CommonBinOp)
+        CommonBinOp = BinOp->getOpcode();
+
+      if (BinOp->getOpcode() != *CommonBinOp)
+        return false;
+
+      switch (*CommonBinOp) {
+      case BinaryOperator::Add:
+      case BinaryOperator::Mul:
+      case BinaryOperator::Or:
+      case BinaryOperator::And:
+      case BinaryOperator::Xor: {
+        auto *Op0 = BinOp->getOperand(0);
+        auto *Op1 = BinOp->getOperand(1);
+        PrevVecV[0] = Op0;
+        PrevVecV[1] = Op1;
+        break;
+      }
+      default:
+        return false;
+      }
+      ShouldBeCallOrBinInst ^= 1;
+
+      OrigCost +=
+          TTI.getArithmeticInstrCost(*CommonBinOp, BinOp->getType(), CostKind);
+
+      if (!isa<ShuffleVectorInst>(PrevVecV[1]))
+        std::swap(PrevVecV[0], PrevVecV[1]);
+      InstWorklist.push(PrevVecV[1]);
+      InstWorklist.push(PrevVecV[0]);
+    } else if (auto *SVInst = dyn_cast<ShuffleVectorInst>(CI)) {
+      // We shouldn't have any null values in the previous vectors;
+      // if so, there was a mismatch in the pattern.
+      if (ShouldBeCallOrBinInst ||
+          any_of(PrevVecV, [](Value *VecV) { return VecV == nullptr; }))
+        return false;
+
+      if (SVInst != PrevVecV[1])
+        return false;
+
+      ArrayRef<int> CurMask;
+      if (!match(SVInst, m_Shuffle(m_Specific(PrevVecV[0]), m_Poison(),
+                                   m_Mask(CurMask))))
+        return false;
+
+      // Subtract the parity mask when checking the condition.
+      for (int Mask = 0, MaskSize = CurMask.size(); Mask != MaskSize; ++Mask) {
+        if (Mask < ShuffleMaskHalf &&
+            CurMask[Mask] != ShuffleMaskHalf + Mask - (ExpectedParityMask & 1))
+          return false;
+        if (Mask >= ShuffleMaskHalf && CurMask[Mask] != -1)
+          return false;
+      }
+
+      // Update mask values.
+      ShuffleMaskHalf *= 2;
+      ShuffleMaskHalf -= (ExpectedParityMask & 1);
+      ExpectedParityMask >>= 1;
+
+      OrigCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
+                                     SVInst->getType(), SVInst->getType(),
+                                     CurMask, CostKind);
+
+      VisitedCnt += 1;
+      if (!ExpectedParityMask && VisitedCnt == NumLevels)
+        break;
+
+      ShouldBeCallOrBinInst ^= 1;
+    } else {
+      return false;
+    }
+  }
+
+  // Pattern should end with a shuffle op.
+  if (ShouldBeCallOrBinInst)
+    return false;
+
+  assert(VecSize != -1 && "Expected Match for Vector Size");
+
+  Value *FinalVecV = PrevVecV[0];
+  if (!FinalVecV)
+    return false;
+
+  auto *FinalVecVTy = cast<FixedVectorType>(FinalVecV->getType());
+
+  Intrinsic::ID ReducedOp =
+      (CommonCallOp ? getMinMaxReductionIntrinsicID(*CommonCallOp)
+                    : getReductionForBinop(*CommonBinOp));
+  if (!ReducedOp)
+    return false;
+
+  IntrinsicCostAttributes ICA(ReducedOp, FinalVecVTy, {FinalVecV});
+  InstructionCost NewCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
+
+  if (NewCost >= OrigCost)
+    return false;
+
+  auto *ReducedResult =
+      Builder.CreateIntrinsic(ReducedOp, {FinalVecV->getType()}, {FinalVecV});
+  replaceValue(I, *ReducedResult);
+
+  return true;
+}
+
 /// Determine if its more efficient to fold:
 /// reduce(trunc(x)) -> trunc(reduce(x)).
 /// reduce(sext(x)) -> sext(reduce(x)).
@@ -4223,6 +4485,9 @@ bool VectorCombine::run() {
       if (foldCastFromReductions(I))
        return true;
      break;
+    case Instruction::ExtractElement:
+      if (foldShuffleChainsToReduce(I))
+        return true;
    case Instruction::ICmp:
    case Instruction::FCmp:
      if (foldExtractExtract(I))
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll
new file mode 100644
index 0000000000000..82b20ccc5b8f5
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-chain-reduction-umin.ll
@@ -0,0 +1,200 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes=vector-combine -S %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes=vector-combine -S %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -passes=vector-combine -S %s | FileCheck %s
+; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v4 -passes=vector-combine -S %s | FileCheck %s
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v8i16(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]])
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32>
+  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32>
+  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+  %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32>
+  %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
+  %7 = extractelement <8 x i16> %6, i64 0
+  ret i16 %7
+}
+
+define i8 @test_reduce_v16i8(<16 x i8> %a0) {
+;
+; CHECK-LABEL: define i8 @test_reduce_v16i8(
+; CHECK-SAME: <16 x i8> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP8:%.*]] = call i8 @llvm.vector.reduce.umin.v16i8(<16 x i8> [[A0]])
+; CHECK-NEXT:    ret i8 [[TMP8]]
+;
+  %1 = shufflevector <16 x i8> %a0, <16 x i8> poison, <16 x i32>
+  %2 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %a0, <16 x i8> %1)
+  %3 = shufflevector <16 x i8> %2, <16 x i8> poison, <16 x i32>
+  %4 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %2, <16 x i8> %3)
+  %5 = shufflevector <16 x i8> %4, <16 x i8> poison, <16 x i32>
+  %6 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %4, <16 x i8> %5)
+  %7 = shufflevector <16 x i8> %6, <16 x i8> poison, <16 x i32>
+  %8 = tail call <16 x i8> @llvm.umin.v16i8(<16 x i8> %6, <16 x i8> %7)
+  %9 = extractelement <16 x i8> %8, i64 0
+  ret i8 %9
+}
+
+define i8 @test_reduce_v32i8(<32 x i8> %a0) {
+; CHECK-LABEL: define i8 @test_reduce_v32i8(
+; CHECK-SAME: <32 x i8> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.vector.reduce.umin.v32i8(<32 x i8> [[A0]])
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %1 = shufflevector <32 x i8> %a0, <32 x i8> poison, <32 x i32>
+  %2 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %a0, <32 x i8> %1)
+  %3 = shufflevector <32 x i8> %2, <32 x i8> poison, <32 x i32>
+  %4 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %2, <32 x i8> %3)
+  %5 = shufflevector <32 x i8> %4, <32 x i8> poison, <32 x i32>
+  %6 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %4, <32 x i8> %5)
+  %7 = shufflevector <32 x i8> %6, <32 x i8> poison, <32 x i32>
+  %8 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %6, <32 x i8> %7)
+  %9 = shufflevector <32 x i8> %8, <32 x i8> poison, <32 x i32>
+  %10 = tail call <32 x i8> @llvm.umin.v32i8(<32 x i8> %8, <32 x i8> %9)
+  %11 = extractelement <32 x i8> %10, i64 0
+  ret i8 %11
+}
+
+define i16 @test_reduce_v16i16(<16 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v16i16(
+; CHECK-SAME: <16 x i16> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v16i16(<16 x i16> [[A0]])
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %1 = shufflevector <16 x i16> %a0, <16 x i16> poison, <16 x i32>
+  %2 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %a0, <16 x i16> %1)
+  %3 = shufflevector <16 x i16> %2, <16 x i16> poison, <16 x i32>
+  %4 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %2, <16 x i16> %3)
+  %5 = shufflevector <16 x i16> %4, <16 x i16> poison, <16 x i32>
+  %6 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %4, <16 x i16> %5)
+  %7 = shufflevector <16 x i16> %6, <16 x i16> poison, <16 x i32>
+  %8 = tail call <16 x i16> @llvm.umin.v16i16(<16 x i16> %6, <16 x i16> %7)
+  %9 = extractelement <16 x i16> %8, i64 0
+  ret i16 %9
+}
+
+define i8 @test_reduce_v64i8(<64 x i8> %a0) {
+; CHECK-LABEL: define i8 @test_reduce_v64i8(
+; CHECK-SAME: <64 x i8> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i8 @llvm.vector.reduce.umin.v64i8(<64 x i8> [[A0]])
+; CHECK-NEXT:    ret i8 [[TMP1]]
+;
+  %1 = shufflevector <64 x i8> %a0, <64 x i8> poison, <64 x i32>
+  %2 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %a0, <64 x i8> %1)
+  %3 = shufflevector <64 x i8> %2, <64 x i8> poison, <64 x i32>
+  %4 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %2, <64 x i8> %3)
+  %5 = shufflevector <64 x i8> %4, <64 x i8> poison, <64 x i32>
+  %6 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %4, <64 x i8> %5)
+  %7 = shufflevector <64 x i8> %6, <64 x i8> poison, <64 x i32>
+  %8 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %6, <64 x i8> %7)
+  %9 = shufflevector <64 x i8> %8, <64 x i8> poison, <64 x i32>
+  %10 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %8, <64 x i8> %9)
+  %11 = shufflevector <64 x i8> %10, <64 x i8> poison, <64 x i32>
+  %12 = tail call <64 x i8> @llvm.umin.v64i8(<64 x i8> %10, <64 x i8> %11)
+  %13 = extractelement <64 x i8> %12, i64 0
+  ret i8 %13
+}
+
+define i16 @test_reduce_v32i16(<32 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v32i16(
+; CHECK-SAME: <32 x i16> [[A0:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v32i16(<32 x i16> [[A0]])
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %1 = shufflevector <32 x i16> %a0, <32 x i16> poison, <32 x i32>
+  %2 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %a0, <32 x i16> %1)
+  %3 = shufflevector <32 x i16> %2, <32 x i16> poison, <32 x i32>
+  %4 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %2, <32 x i16> %3)
+  %5 = shufflevector <32 x i16> %4, <32 x i16> poison, <32 x i32>
+  %6 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %4, <32 x i16> %5)
+  %7 = shufflevector <32 x i16> %6, <32 x i16> poison, <32 x i32>
+  %8 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %6, <32 x i16> %7)
+  %9 = shufflevector <32 x i16> %8, <32 x i16> poison, <32 x i32>
+  %10 = tail call <32 x i16> @llvm.umin.v32i16(<32 x i16> %8, <32 x i16> %9)
+  %11 = extractelement <32 x i16> %10, i64 0
+  ret i16 %11
+}
diff --git a/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll
new file mode 100644
index 0000000000000..403ce33b5344e
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/fold-shuffle-chains-to-reduce.ll
@@ -0,0 +1,195 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=vector-combine -S | FileCheck %s
+
+define i16 @test_reduce_v8i16(<8 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v8i16(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]])
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32>
+  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32>
+  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+  %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32>
+  %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
+  %7 = extractelement <8 x i16> %6, i64 0
+  ret i16 %7
+}
+
+define i16 @test_reduce_v7i16_or(<7 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v7i16_or(
+; CHECK-SAME: <7 x i16> [[A0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.or.v7i16(<7 x i16> [[A0]])
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %1 = shufflevector <7 x i16> %a0, <7 x i16> poison, <7 x i32>
+  %2 = or <7 x i16> %a0, %1
+  %3 = shufflevector <7 x i16> %2, <7 x i16> poison, <7 x i32>
+  %4 = or <7 x i16> %2, %3
+  %5 = shufflevector <7 x i16> %4, <7 x i16> poison, <7 x i32>
+  %6 = or <7 x i16> %4, %5
+  %7 = extractelement <7 x i16> %6, i64 0
+  ret i16 %7
+}
+
+define i16 @test_reduce_v3i16_and(<3 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v3i16_and(
+; CHECK-SAME: <3 x i16> [[A0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.and.v3i16(<3 x i16> [[A0]])
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %1 = shufflevector <3 x i16> %a0, <3 x i16> poison, <3 x i32>
+  %2 = and <3 x i16> %a0, %1
+  %3 = shufflevector <3 x i16> %2, <3 x i16> poison, <3 x i32>
+  %4 = and <3 x i16> %2, %3
+  %5 = extractelement <3 x i16> %4, i64 0
+  ret i16 %5
+}
+
+define i16 @test_reduce_v6i16_xor(<6 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v6i16_xor(
+; CHECK-SAME: <6 x i16> [[A0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = call i16 @llvm.vector.reduce.xor.v6i16(<6 x i16> [[A0]])
+; CHECK-NEXT:    ret i16 [[TMP1]]
+;
+  %1 = shufflevector <6 x i16> %a0, <6 x i16> poison, <6 x i32>
+  %2 = xor <6 x i16> %a0, %1
+  %3 = shufflevector <6 x i16> %2, <6 x i16> poison, <6 x i32>
+  %4 = xor <6 x i16> %2, %3
+  %5 = shufflevector <6 x i16> %4, <6 x i16> poison, <6 x i32>
+  %6 = xor <6 x i16> %4, %5
+  %7 = extractelement <6 x i16> %6, i64 0
+  ret i16 %7
+}
+
+define i16 @test_reduce_v8i16_2(<8 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v8i16_2(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
+; CHECK-NEXT:    [[TMP13:%.*]] = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> [[A0]])
+; CHECK-NEXT:    [[TMP8:%.*]] = shufflevector <8 x i16> [[TMP6]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP9:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP8]])
+; CHECK-NEXT:    [[TMP10:%.*]] = shufflevector <8 x i16> [[TMP9]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP11:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP9]], <8 x i16> [[TMP10]])
+; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <8 x i16> [[TMP11]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP16:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP11]], <8 x i16> [[TMP12]])
+; CHECK-NEXT:    [[TMP14:%.*]] = extractelement <8 x i16> [[TMP16]], i64 0
+; CHECK-NEXT:    [[TMP15:%.*]] = tail call i16 @llvm.umin.i16(i16 [[TMP13]], i16 [[TMP14]])
+; CHECK-NEXT:    ret i16 [[TMP15]]
+;
+  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32>
+  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32>
+  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+  %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32>
+  %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
+  %7 = extractelement <8 x i16> %6, i64 0
+
+  %8 = shufflevector <8 x i16> %6, <8 x i16> poison, <8 x i32>
+  %9 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %8)
+  %10 = shufflevector <8 x i16> %9, <8 x i16> poison, <8 x i32>
+  %11 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %9, <8 x i16> %10)
+  %12 = shufflevector <8 x i16> %11, <8 x i16> poison, <8 x i32>
+  %13 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %11, <8 x i16> %12)
+  %14 = extractelement <8 x i16> %13, i64 0
+
+  %15 = tail call i16 @llvm.umin.i16(i16 %7, i16 %14)
+
+  ret i16 %15
+}
+
+define i16 @test_reduce_v8i16_neg1(<8 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v8i16_neg1(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP6]], i64 0
+; CHECK-NEXT:    ret i16 [[TMP7]]
+;
+  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32>
+  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32>
+  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+  %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32>
+  %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
+  %7 = extractelement <8 x i16> %6, i64 0
+  ret i16 %7
+}
+
+define i16 @test_reduce_v8i16_neg2(<8 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v8i16_neg2(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> [[TMP4]], <8 x i16> [[TMP5]])
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <8 x i16> [[TMP6]], i64 0
+; CHECK-NEXT:    ret i16 [[TMP7]]
+;
+  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32>
+  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32>
+  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+  %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32>
+  %6 = tail call <8 x i16> @llvm.umax.v8i16(<8 x i16> %4, <8 x i16> %5)
+  %7 = extractelement <8 x i16> %6, i64 0
+  ret i16 %7
+}
+
+define i16 @test_reduce_v8i16_neg3(<8 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v8i16_neg3(
+; CHECK-SAME: <8 x i16> [[A0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i16> [[A0]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[A0]], <8 x i16> [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP2]], <8 x i16> [[TMP3]])
+; CHECK-NEXT:    [[TMP6:%.*]] = shufflevector <8 x i16> [[TMP4]], <8 x i16> poison, <8 x i32>
+; CHECK-NEXT:    [[TMP7:%.*]] = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> [[TMP5]], <8 x i16> [[TMP6]])
+; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <8 x i16> [[TMP7]], i64 0
+; CHECK-NEXT:    ret i16 [[TMP8]]
+;
+  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32>
+  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
+  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32>
+  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+  %5 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
+  %6 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32>
+  %7 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %5, <8 x i16> %6)
+  %8 = extractelement <8 x i16> %7, i64 0
+  ret i16 %8
+}
+
+define i16 @test_reduce_v6i16_xor_neg(<6 x i16> %a0) {
+; CHECK-LABEL: define i16 @test_reduce_v6i16_xor_neg(
+; CHECK-SAME: <6 x i16> [[A0:%.*]]) {
+; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <6 x i16> [[A0]], <6 x i16> poison, <6 x i32>
+; CHECK-NEXT:    [[TMP2:%.*]] = xor <6 x i16> [[A0]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <6 x i16> [[TMP2]], <6 x i16> poison, <6 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = xor <6 x i16> [[TMP2]], [[TMP3]]
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <6 x i16> [[TMP4]], <6 x i16> poison, <6 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = xor <6 x i16> [[TMP4]], [[TMP5]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <6 x i16> [[TMP6]], i64 0
+; CHECK-NEXT:    ret i16 [[TMP7]]
+;
+  %1 = shufflevector <6 x i16> %a0, <6 x i16> poison, <6 x i32>
+  %2 = xor <6 x i16> %a0, %1
+  %3 = shufflevector <6 x i16> %2, <6 x i16> poison, <6 x i32>
+  %4 = xor <6 x i16> %2, %3
+  %5 = shufflevector <6 x i16> %4, <6 x i16> poison, <6 x i32>
+  %6 = xor <6 x i16> %4, %5
+  %7 = extractelement <6 x i16> %6, i64 0
+  ret i16 %7
+}
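Note (not part of the patch): as a sketch of the kind of chain the fold matches, here is the <8 x i16> umin case with the shuffle masks written out following the partition pattern described in the fold's comment (4 5 6 7 | poison..., 2 3 | ..., 1 | ...); the exact mask constants in the checked-in tests are the authoritative ones.

  ; input chain (log2(8) = 3 shuffle + umin levels, then extract of lane 0)
  %1 = shufflevector <8 x i16> %a0, <8 x i16> poison, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
  %2 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %a0, <8 x i16> %1)
  %3 = shufflevector <8 x i16> %2, <8 x i16> poison, <8 x i32> <i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %4 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %2, <8 x i16> %3)
  %5 = shufflevector <8 x i16> %4, <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
  %6 = tail call <8 x i16> @llvm.umin.v8i16(<8 x i16> %4, <8 x i16> %5)
  %7 = extractelement <8 x i16> %6, i64 0

  ; when the reduction intrinsic is cheaper, the whole chain is replaced with
  %7 = call i16 @llvm.vector.reduce.umin.v8i16(<8 x i16> %a0)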