diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp index 83b42f6d1794d..772efcdf8f9fc 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/InstructionCost.h" #include "llvm/Support/MathExtras.h" using namespace llvm; @@ -1396,30 +1397,86 @@ InstructionCost SystemZTTIImpl::getInterleavedMemoryOpCost( return NumVectorMemOps + NumPermutes; } +InstructionCost getIntAddReductionCost(unsigned NumVec, unsigned ScalarBits) { + InstructionCost Cost = 0; + // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total. + Cost += NumVec - 1; + // For integer adds, VSUM creates shorter reductions on the final vector. + Cost += (ScalarBits < 32) ? 3 : 2; + return Cost; +} + +InstructionCost getFastReductionCost(unsigned NumVec, unsigned NumElems, + unsigned ScalarBits) { + unsigned NumEltsPerVecReg = (SystemZ::VectorBits / ScalarBits); + InstructionCost Cost = 0; + // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total. + Cost += NumVec - 1; + // For each shuffle / arithmetic layer, we need 2 instructions, and we need + // log2(Elements in Last Vector) layers. + Cost += 2 * Log2_32_Ceil(std::min(NumElems, NumEltsPerVecReg)); + return Cost; +} + +inline bool customCostReductions(unsigned Opcode) { + return Opcode == Instruction::FAdd || Opcode == Instruction::FMul || + Opcode == Instruction::Add || Opcode == Instruction::Mul; +} + +InstructionCost +SystemZTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, + std::optional<FastMathFlags> FMF, + TTI::TargetCostKind CostKind) { + unsigned ScalarBits = Ty->getScalarSizeInBits(); + // The following is only for subtargets with vector math, non-ordered + // reductions, and reasonable scalar sizes for int and fp add/mul. + if (customCostReductions(Opcode) && ST->hasVector() && + !TTI::requiresOrderedReduction(FMF) && + ScalarBits <= SystemZ::VectorBits) { + unsigned NumVectors = getNumVectorRegs(Ty); + unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements(); + // Integer Add uses custom code generation, which needs to be accounted for. + if (Opcode == Instruction::Add) + return getIntAddReductionCost(NumVectors, ScalarBits); + // The base cost is the same across all other arithmetic instructions. + InstructionCost Cost = + getFastReductionCost(NumVectors, NumElems, ScalarBits); + // But we need to account for the final op involving the scalar operand. + if ((Opcode == Instruction::FAdd) || (Opcode == Instruction::FMul)) + Cost += 1; + return Cost; + } + // Otherwise, fall back to the standard implementation. + return BaseT::getArithmeticReductionCost(Opcode, Ty, FMF, CostKind); +} + +InstructionCost +SystemZTTIImpl::getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, + FastMathFlags FMF, + TTI::TargetCostKind CostKind) { + // Return custom costs only on subtargets with vector enhancements. + if (ST->hasVectorEnhancements1()) { + unsigned NumVectors = getNumVectorRegs(Ty); + unsigned NumElems = ((FixedVectorType *)Ty)->getNumElements(); + unsigned ScalarBits = Ty->getScalarSizeInBits(); + InstructionCost Cost = 0; + // Binary Tree of N/2 + N/4 + ... operations yields N - 1 operations total. + Cost += NumVectors - 1; + // For the final vector, we need shuffle + min/max operations, and + // we need #Elements - 1 of them.
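+ // For example, an <8 x i32> min/max reduction uses two input vector registers: one vector min/max combines them, and the final <4 x i32> register then needs 2 * (4 - 1) = 6 shuffle + min/max operations, for a total cost of 7 (cf. vector-reductions.ll).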
+ Cost += 2 * (std::min(NumElems, SystemZ::VectorBits / ScalarBits) - 1); + return Cost; + } + // For other targets, fall back to the standard implementation. + return BaseT::getMinMaxReductionCost(IID, Ty, FMF, CostKind); +} + static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy, const SmallVectorImpl<Type *> &ParamTys) { if (RetTy->isVectorTy() && ID == Intrinsic::bswap) return getNumVectorRegs(RetTy); // VPERM - if (ID == Intrinsic::vector_reduce_add) { - // Retrieve number and size of elements for the vector op. - auto *VTy = cast<FixedVectorType>(ParamTys.front()); - unsigned ScalarSize = VTy->getScalarSizeInBits(); - // For scalar sizes >128 bits, we fall back to the generic cost estimate. - if (ScalarSize > SystemZ::VectorBits) - return -1; - // This many vector regs are needed to represent the input elements (V). - unsigned VectorRegsNeeded = getNumVectorRegs(VTy); - // This many instructions are needed for the final sum of vector elems (S). - unsigned LastVectorHandling = (ScalarSize < 32) ? 3 : 2; - // We use vector adds to create a sum vector, which takes - // V/2 + V/4 + ... = V - 1 operations. - // Then, we need S operations to sum up the elements of that sum vector, - // for a total of V + S - 1 operations. - int Cost = VectorRegsNeeded + LastVectorHandling - 1; - return Cost; - } return -1; } diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 6795da59bf5b1..512fcc854d532 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -130,6 +130,13 @@ class SystemZTTIImpl : public BasicTTIImplBase<SystemZTTIImpl> { Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind, bool UseMaskForCond = false, bool UseMaskForGaps = false); + InstructionCost getArithmeticReductionCost(unsigned Opcode, VectorType *Ty, + std::optional<FastMathFlags> FMF, + TTI::TargetCostKind CostKind); + InstructionCost getMinMaxReductionCost(Intrinsic::ID IID, VectorType *Ty, + FastMathFlags FMF, + TTI::TargetCostKind CostKind); + InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind); diff --git a/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll b/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll deleted file mode 100644 index 90b5b746c914a..0000000000000 --- a/llvm/test/Analysis/CostModel/SystemZ/reduce-add.ll +++ /dev/null @@ -1,128 +0,0 @@ -; RUN: opt < %s -mtriple=systemz-unknown -mcpu=z13 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck %s - -define void @reduce(ptr %src, ptr %dst) { -; CHECK-LABEL: 'reduce' -; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R2_64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %V2_64) -; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R4_64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %V4_64) -; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %R8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %V8_64) -; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %R16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %V16_64) -; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %V2_32) -; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32) -; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R8_32 = call i32
@llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32) -; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32) -; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16) -; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16) -; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16) -; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16) -; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8) -; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8) -; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8) -; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8) -; -; CHECK: Cost Model: Found an estimated cost of 10 for instruction: %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8) -; CHECK: Cost Model: Found an estimated cost of 20 for instruction: %R4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> %V4_256) - - ; REDUCEADD64 - - %V2_64 = load <2 x i64>, ptr %src, align 8 - %R2_64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %V2_64) - store volatile i64 %R2_64, ptr %dst, align 4 - - %V4_64 = load <4 x i64>, ptr %src, align 8 - %R4_64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %V4_64) - store volatile i64 %R4_64, ptr %dst, align 4 - - %V8_64 = load <8 x i64>, ptr %src, align 8 - %R8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> %V8_64) - store volatile i64 %R8_64, ptr %dst, align 4 - - %V16_64 = load <16 x i64>, ptr %src, align 8 - %R16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %V16_64) - store volatile i64 %R16_64, ptr %dst, align 4 - - ; REDUCEADD32 - - %V2_32 = load <2 x i32>, ptr %src, align 8 - %R2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> %V2_32) - store volatile i32 %R2_32, ptr %dst, align 4 - - %V4_32 = load <4 x i32>, ptr %src, align 8 - %R4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %V4_32) - store volatile i32 %R4_32, ptr %dst, align 4 - - %V8_32 = load <8 x i32>, ptr %src, align 8 - %R8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> %V8_32) - store volatile i32 %R8_32, ptr %dst, align 4 - - %V16_32 = load <16 x i32>, ptr %src, align 8 - %R16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> %V16_32) - store volatile i32 %R16_32, ptr %dst, align 4 - - ; REDUCEADD16 - - %V2_16 = load <2 x i16>, ptr %src, align 8 - %R2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> %V2_16) - store volatile i16 %R2_16, ptr %dst, align 4 - - %V4_16 = load <4 x i16>, ptr %src, align 8 - %R4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> %V4_16) - store volatile i16 %R4_16, ptr %dst, align 4 - - %V8_16 = load <8 x i16>, ptr %src, align 8 - %R8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %V8_16) - store volatile i16 %R8_16, ptr %dst, align 4 - - %V16_16 = load <16 x i16>, ptr %src, align 8 - %R16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> %V16_16) - store volatile i16 %R16_16, ptr %dst, align 4 - - ; 
REDUCEADD8 - - %V2_8 = load <2 x i8>, ptr %src, align 8 - %R2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> %V2_8) - store volatile i8 %R2_8, ptr %dst, align 4 - - %V4_8 = load <4 x i8>, ptr %src, align 8 - %R4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> %V4_8) - store volatile i8 %R4_8, ptr %dst, align 4 - - %V8_8 = load <8 x i8>, ptr %src, align 8 - %R8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> %V8_8) - store volatile i8 %R8_8, ptr %dst, align 4 - - %V16_8 = load <16 x i8>, ptr %src, align 8 - %R16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %V16_8) - store volatile i8 %R16_8, ptr %dst, align 4 - - ; EXTREME VALUES - - %V128_8 = load <128 x i8>, ptr %src, align 8 - %R128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> %V128_8) - store volatile i8 %R128_8, ptr %dst, align 4 - - %V4_256 = load <4 x i256>, ptr %src, align 8 - %R4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> %V4_256) - store volatile i256 %R4_256, ptr %dst, align 8 - - ret void -} - -declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) -declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) -declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) -declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) -declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) -declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) -declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) -declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) -declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>) -declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) -declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) -declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) -declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>) -declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) -declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>) -declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) - -declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>) -declare i256 @llvm.vector.reduce.add.v4i256(<4 x i256>) diff --git a/llvm/test/Analysis/CostModel/SystemZ/vector-reductions.ll b/llvm/test/Analysis/CostModel/SystemZ/vector-reductions.ll new file mode 100644 index 0000000000000..0def20215e988 --- /dev/null +++ b/llvm/test/Analysis/CostModel/SystemZ/vector-reductions.ll @@ -0,0 +1,376 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py +; RUN: opt -passes='print<cost-model>' -disable-output -mtriple=s390x-unknown-linux \ +; RUN: -mcpu=z15 < %s 2>&1 | FileCheck %s --check-prefix=Z15 + +define void @fadd_reductions() { +; Z15-LABEL: 'fadd_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %fadd_v4f32 = call float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef)
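+ ; These calls carry no fast-math flags, so they are ordered reductions: the costs above come from the generic implementation, while the fast-math variants below use the new SystemZ-specific reduction costs.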
+ %fadd_v8f32 = call float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef) + %fadd_v2f64 = call double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef) + %fadd_v4f64 = call double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef) + %fadd_v4f128 = call fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) + ret void +} + +define void @fast_fadd_reductions(ptr %src, ptr %dst) { +; Z15-LABEL: 'fast_fadd_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.0, <4 x float> undef) + %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.0, <8 x float> undef) + %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.0, <2 x double> undef) + %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.0, <4 x double> undef) + %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) + ret void +} + +define void @fmul_reductions() { +; Z15-LABEL: 'fmul_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %fmul_v4f32 = call float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef) + %fmul_v8f32 = call float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef) + %fmul_v2f64 = call double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef) + %fmul_v4f64 = call double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef) + %fmul_v4f128 = call fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) + ret void +} + +define void @fast_fmul_reductions() { +; Z15-LABEL: 'fast_fmul_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fmul_v4f32 = call fast float 
@llvm.vector.reduce.fmul.v4f32(float 0.000000e+00, <4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.000000e+00, <8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.000000e+00, <2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.000000e+00, <4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %fmul_v4f32 = call fast float @llvm.vector.reduce.fmul.v4f32(float 0.0, <4 x float> undef) + %fmul_v8f32 = call fast float @llvm.vector.reduce.fmul.v8f32(float 0.0, <8 x float> undef) + %fmul_v2f64 = call fast double @llvm.vector.reduce.fmul.v2f64(double 0.0, <2 x double> undef) + %fmul_v4f64 = call fast double @llvm.vector.reduce.fmul.v4f64(double 0.0, <4 x double> undef) + %fadd_v4f128 = call fast fp128 @llvm.vector.reduce.fmul.v4f128(fp128 undef, <4 x fp128> undef) + + ret void +} + +define void @fmin_reductions() { +; Z15-LABEL: 'fmin_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %V4f32 = call float @llvm.vector.reduce.fmin.v4f32(<4 x float> undef) + %V8f32 = call float @llvm.vector.reduce.fmin.v8f32(<8 x float> undef) + %V2f64 = call double @llvm.vector.reduce.fmin.v2f64(<2 x double> undef) + %V4f64 = call double @llvm.vector.reduce.fmin.v4f64(<4 x double> undef) + %V4f128 = call fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128> undef) + ret void +} + +define void @fmax_reductions() { +; Z15-LABEL: 'fmax_reductions' +; Z15-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V8f32 = call float @llvm.vector.reduce.fmax.v8f32(<8 x float> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void +; + %V4f32 = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> undef) + %V8f32 = call float 
@llvm.vector.reduce.fmax.v8f32(<8 x float> undef) + %V2f64 = call double @llvm.vector.reduce.fmax.v2f64(<2 x double> undef) + %V4f64 = call double @llvm.vector.reduce.fmax.v4f64(<4 x double> undef) + %V4f128 = call fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128> undef) + ret void +} + +define void @reduceumin() { +; Z15-LABEL: 'reduceumin' +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.umin.v4i128(<4 x i128> undef) +; + %V2_64 = call i64 @llvm.vector.reduce.umin.v2i64(<2 x i64> undef) + %V4_64 = call i64 @llvm.vector.reduce.umin.v4i64(<4 x i64> undef) + %V4_32 = call i32 @llvm.vector.reduce.umin.v4i32(<4 x i32> undef) + %V8_32 = call i32 @llvm.vector.reduce.umin.v8i32(<8 x i32> undef) + + %V128_8 = call i8 @llvm.vector.reduce.umin.v128i8(<128 x i8> undef) + %V4_128 = call i128 @llvm.vector.reduce.umin.v4i128(<4 x i128> undef) + + ret void +} + +define void @reduceumax() { +; Z15-LABEL: 'reduceumax' +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.umax.v4i128(<4 x i128> undef) +; + %V2_64 = call i64 @llvm.vector.reduce.umax.v2i64(<2 x i64> undef) + %V4_64 = call i64 @llvm.vector.reduce.umax.v4i64(<4 x i64> undef) + %V4_32 = call i32 @llvm.vector.reduce.umax.v4i32(<4 x i32> undef) + %V8_32 = call i32 @llvm.vector.reduce.umax.v8i32(<8 x i32> undef) + + %V128_8 = call i8 @llvm.vector.reduce.umax.v128i8(<128 x i8> undef) + %V4_128 = call i128 @llvm.vector.reduce.umax.v4i128(<4 x i128> undef) + + ret void +} + +define void @reducesmin() { +; Z15-LABEL: 'reducesmin' +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) +; Z15-NEXT Cost Model: Found 
an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.smin.v4i128(<4 x i128> undef) +; + %V2_64 = call i64 @llvm.vector.reduce.smin.v2i64(<2 x i64> undef) + %V4_64 = call i64 @llvm.vector.reduce.smin.v4i64(<4 x i64> undef) + %V4_32 = call i32 @llvm.vector.reduce.smin.v4i32(<4 x i32> undef) + %V8_32 = call i32 @llvm.vector.reduce.smin.v8i32(<8 x i32> undef) + + %V128_8 = call i8 @llvm.vector.reduce.smin.v128i8(<128 x i8> undef) + %V4_128 = call i128 @llvm.vector.reduce.smin.v4i128(<4 x i128> undef) + + ret void +} + +define void @reducesmax() { +; Z15-LABEL: 'reducesmax' +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 6 for instruction: %V4_32 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 7 for instruction: %V8_32 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 37 for instruction: %V128_8 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) +; Z15-NEXT Cost Model: Found an estimated cost of 3 for instruction: %V4_128 = call i128 @llvm.vector.reduce.smax.v4i128(<4 x i128> undef) +; + %V2_64 = call i64 @llvm.vector.reduce.smax.v2i64(<2 x i64> undef) + %V4_64 = call i64 @llvm.vector.reduce.smax.v4i64(<4 x i64> undef) + %V4_32 = call i32 @llvm.vector.reduce.smax.v4i32(<4 x i32> undef) + %V8_32 = call i32 @llvm.vector.reduce.smax.v8i32(<8 x i32> undef) + + %V128_8 = call i8 @llvm.vector.reduce.smax.v128i8(<128 x i8> undef) + %V4_128 = call i128 @llvm.vector.reduce.smax.v4i128(<4 x i128> undef) + + ret void +} + +define void @reduceadd() { +; Z15-LABEL: 'reduceadd' +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> 
undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) +; +; Z15-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %V128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) +; Z15-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %V4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> undef) + + ; REDUCEADD64 + %V2_64 = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> undef) + %V4_64 = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> undef) + %V8_64 = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> undef) + %V16_64 = call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> undef) + ; REDUCEADD32 + %V2_32 = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> undef) + %V4_32 = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> undef) + %V8_32 = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> undef) + %V16_32 = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> undef) + ; REDUCEADD16 + %V2_16 = call i16 @llvm.vector.reduce.add.v2i16(<2 x i16> undef) + %V4_16 = call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> undef) + %V8_16 = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> undef) + %V16_16 = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> undef) + ; REDUCEADD8 + %V2_8 = call i8 @llvm.vector.reduce.add.v2i8(<2 x i8> undef) + %V4_8 = call i8 @llvm.vector.reduce.add.v4i8(<4 x i8> undef) + %V8_8 = call i8 @llvm.vector.reduce.add.v8i8(<8 x i8> undef) + %V16_8 = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> undef) + ; EXTREME VALUES + %V128_8 = call i8 @llvm.vector.reduce.add.v128i8(<128 x i8> undef) + %V4_256 = call i256 @llvm.vector.reduce.add.v4i256(<4 x i256> undef) + + ret void +} + +define void @reducemul() { +; CHECK-LABEL: 'reducemul' +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %V2_64 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) +; CHECK: Cost Model: Found an estimated cost of 3 for instruction: %V4_64 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) +; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %V8_64 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) +; CHECK: Cost Model: Found an estimated cost of 9 for instruction: %V16_64 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %V2_32 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) +; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %V4_32 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) +; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %V8_32 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) +; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %V16_32 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %V2_16 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) +; CHECK: Cost Model: 
Found an estimated cost of 4 for instruction: %V4_16 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) +; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %V8_16 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) +; CHECK: Cost Model: Found an estimated cost of 7 for instruction: %V16_16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %V2_8 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) +; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %V4_8 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) +; CHECK: Cost Model: Found an estimated cost of 6 for instruction: %V8_8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) +; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %V16_8 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) +; +; CHECK: Cost Model: Found an estimated cost of 15 for instruction: %V128_8 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) +; CHECK: Cost Model: Found an estimated cost of 28 for instruction: %V4_256 = call i256 @llvm.vector.reduce.mul.v4i256(<4 x i256> undef) + + ; REDUCEADD64 + %V2_64 = call i64 @llvm.vector.reduce.mul.v2i64(<2 x i64> undef) + %V4_64 = call i64 @llvm.vector.reduce.mul.v4i64(<4 x i64> undef) + %V8_64 = call i64 @llvm.vector.reduce.mul.v8i64(<8 x i64> undef) + %V16_64 = call i64 @llvm.vector.reduce.mul.v16i64(<16 x i64> undef) + ; REDUCEADD32 + %V2_32 = call i32 @llvm.vector.reduce.mul.v2i32(<2 x i32> undef) + %V4_32 = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> undef) + %V8_32 = call i32 @llvm.vector.reduce.mul.v8i32(<8 x i32> undef) + %V16_32 = call i32 @llvm.vector.reduce.mul.v16i32(<16 x i32> undef) + ; REDUCEADD16 + %V2_16 = call i16 @llvm.vector.reduce.mul.v2i16(<2 x i16> undef) + %V4_16 = call i16 @llvm.vector.reduce.mul.v4i16(<4 x i16> undef) + %V8_16 = call i16 @llvm.vector.reduce.mul.v8i16(<8 x i16> undef) + %V16_16 = call i16 @llvm.vector.reduce.mul.v16i16(<16 x i16> undef) + ; REDUCEADD8 + %V2_8 = call i8 @llvm.vector.reduce.mul.v2i8(<2 x i8> undef) + %V4_8 = call i8 @llvm.vector.reduce.mul.v4i8(<4 x i8> undef) + %V8_8 = call i8 @llvm.vector.reduce.mul.v8i8(<8 x i8> undef) + %V16_8 = call i8 @llvm.vector.reduce.mul.v16i8(<16 x i8> undef) + ; EXTREME VALUES + %V128_8 = call i8 @llvm.vector.reduce.mul.v128i8(<128 x i8> undef) + %V4_256 = call i256 @llvm.vector.reduce.mul.v4i256(<4 x i256> undef) + + ret void +} + +declare float @llvm.vector.reduce.fadd.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fadd.v8f32(float, <8 x float>) +declare double @llvm.vector.reduce.fadd.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fadd.v4f64(double, <4 x double>) +declare fp128 @llvm.vector.reduce.fadd.v4f128(fp128, <4 x fp128>) + +declare float @llvm.vector.reduce.fmul.v4f32(float, <4 x float>) +declare float @llvm.vector.reduce.fmul.v8f32(float, <8 x float>) +declare double @llvm.vector.reduce.fmul.v2f64(double, <2 x double>) +declare double @llvm.vector.reduce.fmul.v4f64(double, <4 x double>) +declare fp128 @llvm.vector.reduce.fmul.v4f128(fp128, <4 x fp128>) + +declare float @llvm.vector.reduce.fmin.v4f32(<4 x float>) +declare float @llvm.vector.reduce.fmin.v8f32(<8 x float>) +declare double @llvm.vector.reduce.fmin.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmin.v4f64(<4 x double>) +declare fp128 @llvm.vector.reduce.fmin.v4f128(<4 x fp128>) + +declare float @llvm.vector.reduce.fmax.v4f32(<4 x float>) +declare float 
@llvm.vector.reduce.fmax.v8f32(<8 x float>) +declare double @llvm.vector.reduce.fmax.v2f64(<2 x double>) +declare double @llvm.vector.reduce.fmax.v4f64(<4 x double>) +declare fp128 @llvm.vector.reduce.fmax.v4f128(<4 x fp128>) + +declare i64 @llvm.vector.reduce.umin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umin.v4i64(<4 x i64>) +declare i32 @llvm.vector.reduce.umin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umin.v8i32(<8 x i32>) +declare i8 @llvm.vector.reduce.umin.v128i8(<128 x i8>) +declare i128 @llvm.vector.reduce.umin.v4i128(<4 x i128>) + +declare i64 @llvm.vector.reduce.umax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.umax.v4i64(<4 x i64>) +declare i32 @llvm.vector.reduce.umax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.umax.v8i32(<8 x i32>) +declare i8 @llvm.vector.reduce.umax.v128i8(<128 x i8>) +declare i128 @llvm.vector.reduce.umax.v4i128(<4 x i128>) + +declare i64 @llvm.vector.reduce.smin.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smin.v4i64(<4 x i64>) +declare i32 @llvm.vector.reduce.smin.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smin.v8i32(<8 x i32>) +declare i8 @llvm.vector.reduce.smin.v128i8(<128 x i8>) +declare i128 @llvm.vector.reduce.smin.v4i128(<4 x i128>) + +declare i64 @llvm.vector.reduce.smax.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.smax.v4i64(<4 x i64>) +declare i32 @llvm.vector.reduce.smax.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.smax.v8i32(<8 x i32>) +declare i8 @llvm.vector.reduce.smax.v128i8(<128 x i8>) +declare i128 @llvm.vector.reduce.smax.v4i128(<4 x i128>) + +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.add.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.add.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.add.v16i64(<16 x i64>) +declare i32 @llvm.vector.reduce.add.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.add.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.add.v16i32(<16 x i32>) +declare i16 @llvm.vector.reduce.add.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.add.v16i16(<16 x i16>) +declare i8 @llvm.vector.reduce.add.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.add.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.add.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.add.v16i8(<16 x i8>) + +declare i8 @llvm.vector.reduce.add.v128i8(<128 x i8>) +declare i256 @llvm.vector.reduce.add.v4i256(<4 x i256>) + +declare i64 @llvm.vector.reduce.mul.v2i64(<2 x i64>) +declare i64 @llvm.vector.reduce.mul.v4i64(<4 x i64>) +declare i64 @llvm.vector.reduce.mul.v8i64(<8 x i64>) +declare i64 @llvm.vector.reduce.mul.v16i64(<16 x i64>) +declare i32 @llvm.vector.reduce.mul.v2i32(<2 x i32>) +declare i32 @llvm.vector.reduce.mul.v4i32(<4 x i32>) +declare i32 @llvm.vector.reduce.mul.v8i32(<8 x i32>) +declare i32 @llvm.vector.reduce.mul.v16i32(<16 x i32>) +declare i16 @llvm.vector.reduce.mul.v2i16(<2 x i16>) +declare i16 @llvm.vector.reduce.mul.v4i16(<4 x i16>) +declare i16 @llvm.vector.reduce.mul.v8i16(<8 x i16>) +declare i16 @llvm.vector.reduce.mul.v16i16(<16 x i16>) +declare i8 @llvm.vector.reduce.mul.v2i8(<2 x i8>) +declare i8 @llvm.vector.reduce.mul.v4i8(<4 x i8>) +declare i8 @llvm.vector.reduce.mul.v8i8(<8 x i8>) +declare i8 @llvm.vector.reduce.mul.v16i8(<16 x i8>) + +declare i8 @llvm.vector.reduce.mul.v128i8(<128 x i8>) +declare i256 @llvm.vector.reduce.mul.v4i256(<4 x i256>) diff --git 
a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll new file mode 100644 index 0000000000000..fa0587f1da931 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fadd.ll @@ -0,0 +1,188 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \ +; RUN: | FileCheck %s +; +; Test vectorization and reassociation of fadd operations. If the loads can +; be vectorized, cases of fewer operands are also profitable to vectorize. + +define double @fadd_double_4_addends_seq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define double @fadd_double_4_addends_seq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> [[TMP0]]) +; CHECK-NEXT: ret double [[TMP1]] +; +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %2 = load double, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn double %add, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %3 = load double, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn double %add3, %3 + ret double %add5 +} + +define double @fadd_double_8_addends_nonseq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define double @fadd_double_8_addends_nonseq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6 +; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8 +; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX6]], align 8 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10 +; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12 +; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14 +; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4 +; 
CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7 +; CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fadd.v8f64(double 0.000000e+00, <8 x double> [[TMP15]]) +; CHECK-NEXT: ret double [[TMP16]] +; +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2 + %1 = load double, ptr %arrayidx1, align 8 + %add = fadd reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4 + %2 = load double, ptr %arrayidx2, align 8 + %add3 = fadd reassoc nsz arcp contract afn double %add, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6 + %3 = load double, ptr %arrayidx4, align 8 + %add5 = fadd reassoc nsz arcp contract afn double %add3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8 + %4 = load double, ptr %arrayidx6, align 8 + %add7 = fadd reassoc nsz arcp contract afn double %add5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10 + %5 = load double, ptr %arrayidx8, align 8 + %add9 = fadd reassoc nsz arcp contract afn double %add7, %5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12 + %6 = load double, ptr %arrayidx10, align 8 + %add11 = fadd reassoc nsz arcp contract afn double %add9, %6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14 + %7 = load double, ptr %arrayidx12, align 8 + %add13 = fadd reassoc nsz arcp contract afn double %add11, %7 + ret double %add13 +} + +define float @fadd_float_16_addends_nonseq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define float @fadd_float_16_addends_nonseq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4 +; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4 +; CHECK-NEXT: 
[[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24 +; CHECK-NEXT: [[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4 +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4 +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15 +; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP31]]) +; CHECK-NEXT: ret float [[TMP32]] +; +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2 + %1 = load float, ptr %arrayidx1, align 4 + %add = fadd reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4 + %2 = load float, ptr %arrayidx2, align 4 + %add3 = fadd reassoc nsz arcp contract afn float %add, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6 + %3 = load float, ptr %arrayidx4, align 4 + %add5 = fadd reassoc nsz arcp contract afn float %add3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8 + %4 = load float, ptr %arrayidx6, align 4 + %add7 = fadd reassoc nsz arcp contract afn float %add5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10 + %5 = load float, ptr %arrayidx8, align 4 + %add9 = fadd reassoc nsz arcp contract afn float %add7, %5 + %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12 + %6 = load float, ptr %arrayidx10, align 4 + %add11 = fadd reassoc nsz arcp contract afn float %add9, %6 + %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14 + %7 = load float, ptr %arrayidx12, align 4 + %add13 = fadd 
reassoc nsz arcp contract afn float %add11, %7 + %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16 + %8 = load float, ptr %arrayidx14, align 4 + %add15 = fadd reassoc nsz arcp contract afn float %add13, %8 + %arrayidx16 = getelementptr inbounds float, ptr %x, i64 18 + %9 = load float, ptr %arrayidx16, align 4 + %add17 = fadd reassoc nsz arcp contract afn float %add15, %9 + %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20 + %10 = load float, ptr %arrayidx18, align 4 + %add19 = fadd reassoc nsz arcp contract afn float %add17, %10 + %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22 + %11 = load float, ptr %arrayidx20, align 4 + %add21 = fadd reassoc nsz arcp contract afn float %add19, %11 + %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24 + %12 = load float, ptr %arrayidx22, align 4 + %add23 = fadd reassoc nsz arcp contract afn float %add21, %12 + %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26 + %13 = load float, ptr %arrayidx24, align 4 + %add25 = fadd reassoc nsz arcp contract afn float %add23, %13 + %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28 + %14 = load float, ptr %arrayidx26, align 4 + %add27 = fadd reassoc nsz arcp contract afn float %add25, %14 + %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30 + %15 = load float, ptr %arrayidx28, align 4 + %add29 = fadd reassoc nsz arcp contract afn float %add27, %15 + ret float %add29 +} diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll new file mode 100644 index 0000000000000..5ea777e1c9a10 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmin-fmax.ll @@ -0,0 +1,411 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \ +; RUN: | FileCheck %s + +; Test vectorization and reassociation of fmin/fmax operations. Vectorization +; is more profitable if the loads are also vectorizable. 
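+; The fmin/fmax reductions below are costed by the new SystemZ-specific getMinMaxReductionCost (see vector-reductions.ll), which makes forming llvm.vector.reduce.fmin/fmax calls profitable here.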
+ +define double @fmin_double_4_nums_seq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define double @fmin_double_4_nums_seq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[X]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[TMP1]]) +; CHECK-NEXT: ret double [[TMP2]] +; + %g1 = getelementptr inbounds double, ptr %x, i64 1 + %g2 = getelementptr inbounds double, ptr %x, i64 2 + %g3 = getelementptr inbounds double, ptr %x, i64 3 + %t0 = load double, ptr %x, align 4 + %t1 = load double, ptr %g1, align 4 + %t2 = load double, ptr %g2, align 4 + %t3 = load double, ptr %g3, align 4 + %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0) + %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1) + %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2) + ret double %m3 +} + +define double @fmin_double_16_nums_nonseq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define double @fmin_double_16_nums_nonseq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4 +; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6 +; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8 +; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10 +; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12 +; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14 +; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 16 +; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds double, ptr [[X]], i64 18 +; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 20 +; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 22 +; CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 24 +; CHECK-NEXT: [[G13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 26 +; CHECK-NEXT: [[G14:%.*]] = getelementptr inbounds double, ptr [[X]], i64 28 +; CHECK-NEXT: [[G15:%.*]] = getelementptr inbounds double, ptr [[X]], i64 30 +; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[X]], align 4 +; CHECK-NEXT: [[T1:%.*]] = load double, ptr [[G1]], align 4 +; CHECK-NEXT: [[T2:%.*]] = load double, ptr [[G2]], align 4 +; CHECK-NEXT: [[T3:%.*]] = load double, ptr [[G3]], align 4 +; CHECK-NEXT: [[T4:%.*]] = load double, ptr [[G4]], align 4 +; CHECK-NEXT: [[T5:%.*]] = load double, ptr [[G5]], align 4 +; CHECK-NEXT: [[T6:%.*]] = load double, ptr [[G6]], align 4 +; CHECK-NEXT: [[T7:%.*]] = load double, ptr [[G7]], align 4 +; CHECK-NEXT: [[T8:%.*]] = load double, ptr [[G8]], align 4 +; CHECK-NEXT: [[T9:%.*]] = load double, ptr [[G9]], align 4 +; CHECK-NEXT: [[T10:%.*]] = load double, ptr [[G10]], align 4 +; CHECK-NEXT: [[T11:%.*]] = load double, ptr [[G11]], align 4 +; CHECK-NEXT: [[T12:%.*]] = load double, ptr [[G12]], align 4 +; CHECK-NEXT: [[T13:%.*]] = load double, ptr [[G13]], align 4 +; CHECK-NEXT: [[T14:%.*]] = load double, ptr [[G14]], align 4 +; CHECK-NEXT: [[T15:%.*]] = load double, ptr [[G15]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x double> poison, double [[T1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x double> [[TMP1]], double [[T0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x double> 
[[TMP2]], double [[T2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> [[TMP3]], double [[T3]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x double> [[TMP4]], double [[T4]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x double> [[TMP5]], double [[T5]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x double> [[TMP6]], double [[T6]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x double> [[TMP7]], double [[T7]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x double> [[TMP8]], double [[T8]], i32 8 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x double> [[TMP9]], double [[T9]], i32 9 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x double> [[TMP10]], double [[T10]], i32 10 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x double> [[TMP11]], double [[T11]], i32 11 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x double> [[TMP12]], double [[T12]], i32 12 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x double> [[TMP13]], double [[T13]], i32 13 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x double> [[TMP14]], double [[T14]], i32 14 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x double> [[TMP15]], double [[T15]], i32 15 +; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fmin.v16f64(<16 x double> [[TMP16]]) +; CHECK-NEXT: ret double [[TMP17]] +; + %g1 = getelementptr inbounds double, ptr %x, i64 2 + %g2 = getelementptr inbounds double, ptr %x, i64 4 + %g3 = getelementptr inbounds double, ptr %x, i64 6 + %g4 = getelementptr inbounds double, ptr %x, i64 8 + %g5 = getelementptr inbounds double, ptr %x, i64 10 + %g6 = getelementptr inbounds double, ptr %x, i64 12 + %g7 = getelementptr inbounds double, ptr %x, i64 14 + %g8 = getelementptr inbounds double, ptr %x, i64 16 + %g9 = getelementptr inbounds double, ptr %x, i64 18 + %g10 = getelementptr inbounds double, ptr %x, i64 20 + %g11 = getelementptr inbounds double, ptr %x, i64 22 + %g12 = getelementptr inbounds double, ptr %x, i64 24 + %g13 = getelementptr inbounds double, ptr %x, i64 26 + %g14 = getelementptr inbounds double, ptr %x, i64 28 + %g15 = getelementptr inbounds double, ptr %x, i64 30 + %t0 = load double, ptr %x, align 4 + %t1 = load double, ptr %g1, align 4 + %t2 = load double, ptr %g2, align 4 + %t3 = load double, ptr %g3, align 4 + %t4 = load double, ptr %g4, align 4 + %t5 = load double, ptr %g5, align 4 + %t6 = load double, ptr %g6, align 4 + %t7 = load double, ptr %g7, align 4 + %t8 = load double, ptr %g8, align 4 + %t9 = load double, ptr %g9, align 4 + %t10 = load double, ptr %g10, align 4 + %t11 = load double, ptr %g11, align 4 + %t12 = load double, ptr %g12, align 4 + %t13 = load double, ptr %g13, align 4 + %t14 = load double, ptr %g14, align 4 + %t15 = load double, ptr %g15, align 4 + %m1 = tail call fast double @llvm.minnum.f64(double %t1, double %t0) + %m2 = tail call fast double @llvm.minnum.f64(double %t2, double %m1) + %m3 = tail call fast double @llvm.minnum.f64(double %t3, double %m2) + %m4 = tail call fast double @llvm.minnum.f64(double %t4, double %m3) + %m5 = tail call fast double @llvm.minnum.f64(double %t5, double %m4) + %m6 = tail call fast double @llvm.minnum.f64(double %t6, double %m5) + %m7 = tail call fast double @llvm.minnum.f64(double %t7, double %m6) + %m8 = tail call fast double @llvm.minnum.f64(double %t8, double %m7) + %m9 = tail call fast double @llvm.minnum.f64(double %t9, double %m8) + %m10 = tail call fast double @llvm.minnum.f64(double %t10, double %m9) + %m11 = tail call fast double @llvm.minnum.f64(double 
%t11, double %m10) + %m12 = tail call fast double @llvm.minnum.f64(double %t12, double %m11) + %m13 = tail call fast double @llvm.minnum.f64(double %t13, double %m12) + %m14 = tail call fast double @llvm.minnum.f64(double %t14, double %m13) + %m15 = tail call fast double @llvm.minnum.f64(double %t15, double %m14) + ret double %m15 +} + +define float @fmin_float_12_nums_nonseq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define float @fmin_float_12_nums_nonseq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4 +; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6 +; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8 +; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10 +; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12 +; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14 +; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16 +; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18 +; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20 +; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22 +; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[X]], align 4 +; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4 +; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4 +; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4 +; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[G4]], align 4 +; CHECK-NEXT: [[T5:%.*]] = load float, ptr [[G5]], align 4 +; CHECK-NEXT: [[T6:%.*]] = load float, ptr [[G6]], align 4 +; CHECK-NEXT: [[T7:%.*]] = load float, ptr [[G7]], align 4 +; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[G8]], align 4 +; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[G9]], align 4 +; CHECK-NEXT: [[T10:%.*]] = load float, ptr [[G10]], align 4 +; CHECK-NEXT: [[T11:%.*]] = load float, ptr [[G11]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <12 x float> poison, float [[T1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <12 x float> [[TMP1]], float [[T0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <12 x float> [[TMP2]], float [[T2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <12 x float> [[TMP3]], float [[T3]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <12 x float> [[TMP4]], float [[T4]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <12 x float> [[TMP5]], float [[T5]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <12 x float> [[TMP6]], float [[T6]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <12 x float> [[TMP7]], float [[T7]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <12 x float> [[TMP8]], float [[T8]], i32 8 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <12 x float> [[TMP9]], float [[T9]], i32 9 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <12 x float> [[TMP10]], float [[T10]], i32 10 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <12 x float> [[TMP11]], float [[T11]], i32 11 +; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fmin.v12f32(<12 x float> [[TMP12]]) +; CHECK-NEXT: ret float [[TMP13]] +; + %g1 = getelementptr inbounds float, ptr %x, i64 2 + %g2 = getelementptr inbounds float, ptr %x, i64 4 + %g3 = getelementptr inbounds float, ptr %x, i64 6 + %g4 = getelementptr inbounds float, ptr %x, i64 8 + %g5 = getelementptr inbounds float, ptr %x, i64 10 + 
%g6 = getelementptr inbounds float, ptr %x, i64 12 + %g7 = getelementptr inbounds float, ptr %x, i64 14 + %g8 = getelementptr inbounds float, ptr %x, i64 16 + %g9 = getelementptr inbounds float, ptr %x, i64 18 + %g10 = getelementptr inbounds float, ptr %x, i64 20 + %g11 = getelementptr inbounds float, ptr %x, i64 22 + %t0 = load float, ptr %x, align 4 + %t1 = load float, ptr %g1, align 4 + %t2 = load float, ptr %g2, align 4 + %t3 = load float, ptr %g3, align 4 + %t4 = load float, ptr %g4, align 4 + %t5 = load float, ptr %g5, align 4 + %t6 = load float, ptr %g6, align 4 + %t7 = load float, ptr %g7, align 4 + %t8 = load float, ptr %g8, align 4 + %t9 = load float, ptr %g9, align 4 + %t10 = load float, ptr %g10, align 4 + %t11 = load float, ptr %g11, align 4 + %m1 = tail call fast float @llvm.minnum.f32(float %t1, float %t0) + %m2 = tail call fast float @llvm.minnum.f32(float %t2, float %m1) + %m3 = tail call fast float @llvm.minnum.f32(float %t3, float %m2) + %m4 = tail call fast float @llvm.minnum.f32(float %t4, float %m3) + %m5 = tail call fast float @llvm.minnum.f32(float %t5, float %m4) + %m6 = tail call fast float @llvm.minnum.f32(float %t6, float %m5) + %m7 = tail call fast float @llvm.minnum.f32(float %t7, float %m6) + %m8 = tail call fast float @llvm.minnum.f32(float %t8, float %m7) + %m9 = tail call fast float @llvm.minnum.f32(float %t9, float %m8) + %m10 = tail call fast float @llvm.minnum.f32(float %t10, float %m9) + %m11 = tail call fast float @llvm.minnum.f32(float %t11, float %m10) + ret float %m11 +} + +define double @fmax_double_4_nums_seq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define double @fmax_double_4_nums_seq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr [[X]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[TMP1]]) +; CHECK-NEXT: ret double [[TMP2]] +; + %g1 = getelementptr inbounds double, ptr %x, i64 1 + %g2 = getelementptr inbounds double, ptr %x, i64 2 + %g3 = getelementptr inbounds double, ptr %x, i64 3 + %t0 = load double, ptr %x, align 4 + %t1 = load double, ptr %g1, align 4 + %t2 = load double, ptr %g2, align 4 + %t3 = load double, ptr %g3, align 4 + %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0) + %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1) + %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2) + ret double %m3 +} + +define double @fmax_double_16_nums_nonseq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define double @fmax_double_16_nums_nonseq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4 +; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6 +; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8 +; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10 +; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12 +; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14 +; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 16 +; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds double, ptr [[X]], i64 18 +; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 20 +; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 22 +; 
CHECK-NEXT: [[G12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 24 +; CHECK-NEXT: [[G13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 26 +; CHECK-NEXT: [[G14:%.*]] = getelementptr inbounds double, ptr [[X]], i64 28 +; CHECK-NEXT: [[G15:%.*]] = getelementptr inbounds double, ptr [[X]], i64 30 +; CHECK-NEXT: [[T0:%.*]] = load double, ptr [[X]], align 4 +; CHECK-NEXT: [[T1:%.*]] = load double, ptr [[G1]], align 4 +; CHECK-NEXT: [[T2:%.*]] = load double, ptr [[G2]], align 4 +; CHECK-NEXT: [[T3:%.*]] = load double, ptr [[G3]], align 4 +; CHECK-NEXT: [[T4:%.*]] = load double, ptr [[G4]], align 4 +; CHECK-NEXT: [[T5:%.*]] = load double, ptr [[G5]], align 4 +; CHECK-NEXT: [[T6:%.*]] = load double, ptr [[G6]], align 4 +; CHECK-NEXT: [[T7:%.*]] = load double, ptr [[G7]], align 4 +; CHECK-NEXT: [[T8:%.*]] = load double, ptr [[G8]], align 4 +; CHECK-NEXT: [[T9:%.*]] = load double, ptr [[G9]], align 4 +; CHECK-NEXT: [[T10:%.*]] = load double, ptr [[G10]], align 4 +; CHECK-NEXT: [[T11:%.*]] = load double, ptr [[G11]], align 4 +; CHECK-NEXT: [[T12:%.*]] = load double, ptr [[G12]], align 4 +; CHECK-NEXT: [[T13:%.*]] = load double, ptr [[G13]], align 4 +; CHECK-NEXT: [[T14:%.*]] = load double, ptr [[G14]], align 4 +; CHECK-NEXT: [[T15:%.*]] = load double, ptr [[G15]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x double> poison, double [[T1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x double> [[TMP1]], double [[T0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <16 x double> [[TMP2]], double [[T2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <16 x double> [[TMP3]], double [[T3]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <16 x double> [[TMP4]], double [[T4]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <16 x double> [[TMP5]], double [[T5]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <16 x double> [[TMP6]], double [[T6]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <16 x double> [[TMP7]], double [[T7]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <16 x double> [[TMP8]], double [[T8]], i32 8 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <16 x double> [[TMP9]], double [[T9]], i32 9 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <16 x double> [[TMP10]], double [[T10]], i32 10 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <16 x double> [[TMP11]], double [[T11]], i32 11 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <16 x double> [[TMP12]], double [[T12]], i32 12 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <16 x double> [[TMP13]], double [[T13]], i32 13 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <16 x double> [[TMP14]], double [[T14]], i32 14 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x double> [[TMP15]], double [[T15]], i32 15 +; CHECK-NEXT: [[TMP17:%.*]] = call fast double @llvm.vector.reduce.fmax.v16f64(<16 x double> [[TMP16]]) +; CHECK-NEXT: ret double [[TMP17]] +; + %g1 = getelementptr inbounds double, ptr %x, i64 2 + %g2 = getelementptr inbounds double, ptr %x, i64 4 + %g3 = getelementptr inbounds double, ptr %x, i64 6 + %g4 = getelementptr inbounds double, ptr %x, i64 8 + %g5 = getelementptr inbounds double, ptr %x, i64 10 + %g6 = getelementptr inbounds double, ptr %x, i64 12 + %g7 = getelementptr inbounds double, ptr %x, i64 14 + %g8 = getelementptr inbounds double, ptr %x, i64 16 + %g9 = getelementptr inbounds double, ptr %x, i64 18 + %g10 = getelementptr inbounds double, ptr %x, i64 20 + %g11 = getelementptr inbounds double, ptr %x, i64 22 + %g12 = getelementptr inbounds double, ptr %x, i64 24 + %g13 = 
getelementptr inbounds double, ptr %x, i64 26 + %g14 = getelementptr inbounds double, ptr %x, i64 28 + %g15 = getelementptr inbounds double, ptr %x, i64 30 + %t0 = load double, ptr %x, align 4 + %t1 = load double, ptr %g1, align 4 + %t2 = load double, ptr %g2, align 4 + %t3 = load double, ptr %g3, align 4 + %t4 = load double, ptr %g4, align 4 + %t5 = load double, ptr %g5, align 4 + %t6 = load double, ptr %g6, align 4 + %t7 = load double, ptr %g7, align 4 + %t8 = load double, ptr %g8, align 4 + %t9 = load double, ptr %g9, align 4 + %t10 = load double, ptr %g10, align 4 + %t11 = load double, ptr %g11, align 4 + %t12 = load double, ptr %g12, align 4 + %t13 = load double, ptr %g13, align 4 + %t14 = load double, ptr %g14, align 4 + %t15 = load double, ptr %g15, align 4 + %m1 = tail call fast double @llvm.maxnum.f64(double %t1, double %t0) + %m2 = tail call fast double @llvm.maxnum.f64(double %t2, double %m1) + %m3 = tail call fast double @llvm.maxnum.f64(double %t3, double %m2) + %m4 = tail call fast double @llvm.maxnum.f64(double %t4, double %m3) + %m5 = tail call fast double @llvm.maxnum.f64(double %t5, double %m4) + %m6 = tail call fast double @llvm.maxnum.f64(double %t6, double %m5) + %m7 = tail call fast double @llvm.maxnum.f64(double %t7, double %m6) + %m8 = tail call fast double @llvm.maxnum.f64(double %t8, double %m7) + %m9 = tail call fast double @llvm.maxnum.f64(double %t9, double %m8) + %m10 = tail call fast double @llvm.maxnum.f64(double %t10, double %m9) + %m11 = tail call fast double @llvm.maxnum.f64(double %t11, double %m10) + %m12 = tail call fast double @llvm.maxnum.f64(double %t12, double %m11) + %m13 = tail call fast double @llvm.maxnum.f64(double %t13, double %m12) + %m14 = tail call fast double @llvm.maxnum.f64(double %t14, double %m13) + %m15 = tail call fast double @llvm.maxnum.f64(double %t15, double %m14) + ret double %m15 +} + +define float @fmax_float_12_nums_nonseq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define float @fmax_float_12_nums_nonseq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[G1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[G2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4 +; CHECK-NEXT: [[G3:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6 +; CHECK-NEXT: [[G4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8 +; CHECK-NEXT: [[G5:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10 +; CHECK-NEXT: [[G6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12 +; CHECK-NEXT: [[G7:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14 +; CHECK-NEXT: [[G8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16 +; CHECK-NEXT: [[G9:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18 +; CHECK-NEXT: [[G10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20 +; CHECK-NEXT: [[G11:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22 +; CHECK-NEXT: [[T0:%.*]] = load float, ptr [[X]], align 4 +; CHECK-NEXT: [[T1:%.*]] = load float, ptr [[G1]], align 4 +; CHECK-NEXT: [[T2:%.*]] = load float, ptr [[G2]], align 4 +; CHECK-NEXT: [[T3:%.*]] = load float, ptr [[G3]], align 4 +; CHECK-NEXT: [[T4:%.*]] = load float, ptr [[G4]], align 4 +; CHECK-NEXT: [[T5:%.*]] = load float, ptr [[G5]], align 4 +; CHECK-NEXT: [[T6:%.*]] = load float, ptr [[G6]], align 4 +; CHECK-NEXT: [[T7:%.*]] = load float, ptr [[G7]], align 4 +; CHECK-NEXT: [[T8:%.*]] = load float, ptr [[G8]], align 4 +; CHECK-NEXT: [[T9:%.*]] = load float, ptr [[G9]], align 4 +; CHECK-NEXT: [[T10:%.*]] = load 
float, ptr [[G10]], align 4 +; CHECK-NEXT: [[T11:%.*]] = load float, ptr [[G11]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <12 x float> poison, float [[T1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <12 x float> [[TMP1]], float [[T0]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <12 x float> [[TMP2]], float [[T2]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <12 x float> [[TMP3]], float [[T3]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <12 x float> [[TMP4]], float [[T4]], i32 4 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <12 x float> [[TMP5]], float [[T5]], i32 5 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <12 x float> [[TMP6]], float [[T6]], i32 6 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <12 x float> [[TMP7]], float [[T7]], i32 7 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <12 x float> [[TMP8]], float [[T8]], i32 8 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <12 x float> [[TMP9]], float [[T9]], i32 9 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <12 x float> [[TMP10]], float [[T10]], i32 10 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <12 x float> [[TMP11]], float [[T11]], i32 11 +; CHECK-NEXT: [[TMP13:%.*]] = call fast float @llvm.vector.reduce.fmax.v12f32(<12 x float> [[TMP12]]) +; CHECK-NEXT: ret float [[TMP13]] +; + %g1 = getelementptr inbounds float, ptr %x, i64 2 + %g2 = getelementptr inbounds float, ptr %x, i64 4 + %g3 = getelementptr inbounds float, ptr %x, i64 6 + %g4 = getelementptr inbounds float, ptr %x, i64 8 + %g5 = getelementptr inbounds float, ptr %x, i64 10 + %g6 = getelementptr inbounds float, ptr %x, i64 12 + %g7 = getelementptr inbounds float, ptr %x, i64 14 + %g8 = getelementptr inbounds float, ptr %x, i64 16 + %g9 = getelementptr inbounds float, ptr %x, i64 18 + %g10 = getelementptr inbounds float, ptr %x, i64 20 + %g11 = getelementptr inbounds float, ptr %x, i64 22 + %t0 = load float, ptr %x, align 4 + %t1 = load float, ptr %g1, align 4 + %t2 = load float, ptr %g2, align 4 + %t3 = load float, ptr %g3, align 4 + %t4 = load float, ptr %g4, align 4 + %t5 = load float, ptr %g5, align 4 + %t6 = load float, ptr %g6, align 4 + %t7 = load float, ptr %g7, align 4 + %t8 = load float, ptr %g8, align 4 + %t9 = load float, ptr %g9, align 4 + %t10 = load float, ptr %g10, align 4 + %t11 = load float, ptr %g11, align 4 + %m1 = tail call fast float @llvm.maxnum.f32(float %t1, float %t0) + %m2 = tail call fast float @llvm.maxnum.f32(float %t2, float %m1) + %m3 = tail call fast float @llvm.maxnum.f32(float %t3, float %m2) + %m4 = tail call fast float @llvm.maxnum.f32(float %t4, float %m3) + %m5 = tail call fast float @llvm.maxnum.f32(float %t5, float %m4) + %m6 = tail call fast float @llvm.maxnum.f32(float %t6, float %m5) + %m7 = tail call fast float @llvm.maxnum.f32(float %t7, float %m6) + %m8 = tail call fast float @llvm.maxnum.f32(float %t8, float %m7) + %m9 = tail call fast float @llvm.maxnum.f32(float %t9, float %m8) + %m10 = tail call fast float @llvm.maxnum.f32(float %t10, float %m9) + %m11 = tail call fast float @llvm.maxnum.f32(float %t11, float %m10) + ret float %m11 +} + +declare float @llvm.minnum.f32(float, float) +declare double @llvm.minnum.f64(double, double) +declare float @llvm.maxnum.f32(float, float) +declare double @llvm.maxnum.f64(double, double) diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll new file mode 100644 index 0000000000000..e08b38c69a840 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/reductions-fmul.ll @@ 
-0,0 +1,188 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=s390x-unknown-linux -mcpu=z15 -passes=slp-vectorizer %s -S -o - \ +; RUN: | FileCheck %s + +; Test vectorization and reassociation of fmul operations. If the loads can +; be vectorized, cases of fewer operands are also profitable to vectorize. + +define double @fmul_double_4_factors_seq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define double @fmul_double_4_factors_seq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[X]], align 8 +; CHECK-NEXT: [[TMP1:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v4f64(double 1.000000e+00, <4 x double> [[TMP0]]) +; CHECK-NEXT: ret double [[TMP1]] +; +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 1 + %1 = load double, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 2 + %2 = load double, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 3 + %3 = load double, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3 + ret double %mul5 +} + +define double @fmul_double_8_factors_nonseq(ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define double @fmul_double_8_factors_nonseq( +; CHECK-SAME: ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[X]], align 8 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds double, ptr [[X]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load double, ptr [[ARRAYIDX1]], align 8 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds double, ptr [[X]], i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = load double, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[X]], i64 6 +; CHECK-NEXT: [[TMP3:%.*]] = load double, ptr [[ARRAYIDX4]], align 8 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds double, ptr [[X]], i64 8 +; CHECK-NEXT: [[TMP4:%.*]] = load double, ptr [[ARRAYIDX6]], align 8 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds double, ptr [[X]], i64 10 +; CHECK-NEXT: [[TMP5:%.*]] = load double, ptr [[ARRAYIDX8]], align 8 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds double, ptr [[X]], i64 12 +; CHECK-NEXT: [[TMP6:%.*]] = load double, ptr [[ARRAYIDX10]], align 8 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds double, ptr [[X]], i64 14 +; CHECK-NEXT: [[TMP7:%.*]] = load double, ptr [[ARRAYIDX12]], align 8 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <8 x double> poison, double [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <8 x double> [[TMP8]], double [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <8 x double> [[TMP9]], double [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP11:%.*]] = insertelement <8 x double> [[TMP10]], double [[TMP3]], i32 3 +; CHECK-NEXT: [[TMP12:%.*]] = insertelement <8 x double> [[TMP11]], double [[TMP4]], i32 4 +; CHECK-NEXT: [[TMP13:%.*]] = insertelement <8 x double> [[TMP12]], double [[TMP5]], i32 5 +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <8 x double> [[TMP13]], double [[TMP6]], i32 6 +; CHECK-NEXT: [[TMP15:%.*]] = insertelement <8 x double> [[TMP14]], double [[TMP7]], i32 7 +; 
CHECK-NEXT: [[TMP16:%.*]] = call reassoc nsz arcp contract afn double @llvm.vector.reduce.fmul.v8f64(double 1.000000e+00, <8 x double> [[TMP15]]) +; CHECK-NEXT: ret double [[TMP16]] +; +entry: + %0 = load double, ptr %x, align 8 + %arrayidx1 = getelementptr inbounds double, ptr %x, i64 2 + %1 = load double, ptr %arrayidx1, align 8 + %mul = fmul reassoc nsz arcp contract afn double %1, %0 + %arrayidx2 = getelementptr inbounds double, ptr %x, i64 4 + %2 = load double, ptr %arrayidx2, align 8 + %mul3 = fmul reassoc nsz arcp contract afn double %mul, %2 + %arrayidx4 = getelementptr inbounds double, ptr %x, i64 6 + %3 = load double, ptr %arrayidx4, align 8 + %mul5 = fmul reassoc nsz arcp contract afn double %mul3, %3 + %arrayidx6 = getelementptr inbounds double, ptr %x, i64 8 + %4 = load double, ptr %arrayidx6, align 8 + %mul7 = fmul reassoc nsz arcp contract afn double %mul5, %4 + %arrayidx8 = getelementptr inbounds double, ptr %x, i64 10 + %5 = load double, ptr %arrayidx8, align 8 + %mul9 = fmul reassoc nsz arcp contract afn double %mul7, %5 + %arrayidx10 = getelementptr inbounds double, ptr %x, i64 12 + %6 = load double, ptr %arrayidx10, align 8 + %mul11 = fmul reassoc nsz arcp contract afn double %mul9, %6 + %arrayidx12 = getelementptr inbounds double, ptr %x, i64 14 + %7 = load double, ptr %arrayidx12, align 8 + %mul13 = fmul reassoc nsz arcp contract afn double %mul11, %7 + ret double %mul13 +} + +define float @fmul_float_16_factors_nonseq(float noundef %m, ptr nocapture noundef readonly %x) { +; CHECK-LABEL: define float @fmul_float_16_factors_nonseq( +; CHECK-SAME: float noundef [[M:%.*]], ptr nocapture noundef readonly [[X:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = load float, ptr [[X]], align 4 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds float, ptr [[X]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds float, ptr [[X]], i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = load float, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[X]], i64 6 +; CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 +; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds float, ptr [[X]], i64 8 +; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX6]], align 4 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds float, ptr [[X]], i64 10 +; CHECK-NEXT: [[TMP5:%.*]] = load float, ptr [[ARRAYIDX8]], align 4 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds float, ptr [[X]], i64 12 +; CHECK-NEXT: [[TMP6:%.*]] = load float, ptr [[ARRAYIDX10]], align 4 +; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds float, ptr [[X]], i64 14 +; CHECK-NEXT: [[TMP7:%.*]] = load float, ptr [[ARRAYIDX12]], align 4 +; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds float, ptr [[X]], i64 16 +; CHECK-NEXT: [[TMP8:%.*]] = load float, ptr [[ARRAYIDX14]], align 4 +; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 18 +; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX16]], align 4 +; CHECK-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds float, ptr [[X]], i64 20 +; CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[ARRAYIDX18]], align 4 +; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds float, ptr [[X]], i64 22 +; CHECK-NEXT: [[TMP11:%.*]] = load float, ptr [[ARRAYIDX20]], align 4 +; CHECK-NEXT: [[ARRAYIDX22:%.*]] = getelementptr inbounds float, ptr [[X]], i64 24 +; CHECK-NEXT: 
[[TMP12:%.*]] = load float, ptr [[ARRAYIDX22]], align 4 +; CHECK-NEXT: [[ARRAYIDX24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 26 +; CHECK-NEXT: [[TMP13:%.*]] = load float, ptr [[ARRAYIDX24]], align 4 +; CHECK-NEXT: [[ARRAYIDX26:%.*]] = getelementptr inbounds float, ptr [[X]], i64 28 +; CHECK-NEXT: [[TMP14:%.*]] = load float, ptr [[ARRAYIDX26]], align 4 +; CHECK-NEXT: [[ARRAYIDX28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 +; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[ARRAYIDX28]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = insertelement <16 x float> poison, float [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <16 x float> [[TMP16]], float [[TMP0]], i32 1 +; CHECK-NEXT: [[TMP18:%.*]] = insertelement <16 x float> [[TMP17]], float [[TMP2]], i32 2 +; CHECK-NEXT: [[TMP19:%.*]] = insertelement <16 x float> [[TMP18]], float [[TMP3]], i32 3 +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <16 x float> [[TMP19]], float [[TMP4]], i32 4 +; CHECK-NEXT: [[TMP21:%.*]] = insertelement <16 x float> [[TMP20]], float [[TMP5]], i32 5 +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <16 x float> [[TMP21]], float [[TMP6]], i32 6 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <16 x float> [[TMP22]], float [[TMP7]], i32 7 +; CHECK-NEXT: [[TMP24:%.*]] = insertelement <16 x float> [[TMP23]], float [[TMP8]], i32 8 +; CHECK-NEXT: [[TMP25:%.*]] = insertelement <16 x float> [[TMP24]], float [[TMP9]], i32 9 +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <16 x float> [[TMP25]], float [[TMP10]], i32 10 +; CHECK-NEXT: [[TMP27:%.*]] = insertelement <16 x float> [[TMP26]], float [[TMP11]], i32 11 +; CHECK-NEXT: [[TMP28:%.*]] = insertelement <16 x float> [[TMP27]], float [[TMP12]], i32 12 +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <16 x float> [[TMP28]], float [[TMP13]], i32 13 +; CHECK-NEXT: [[TMP30:%.*]] = insertelement <16 x float> [[TMP29]], float [[TMP14]], i32 14 +; CHECK-NEXT: [[TMP31:%.*]] = insertelement <16 x float> [[TMP30]], float [[TMP15]], i32 15 +; CHECK-NEXT: [[TMP32:%.*]] = call reassoc nsz arcp contract afn float @llvm.vector.reduce.fmul.v16f32(float 1.000000e+00, <16 x float> [[TMP31]]) +; CHECK-NEXT: ret float [[TMP32]] +; +entry: + %0 = load float, ptr %x, align 4 + %arrayidx1 = getelementptr inbounds float, ptr %x, i64 2 + %1 = load float, ptr %arrayidx1, align 4 + %mul = fmul reassoc nsz arcp contract afn float %1, %0 + %arrayidx2 = getelementptr inbounds float, ptr %x, i64 4 + %2 = load float, ptr %arrayidx2, align 4 + %mul3 = fmul reassoc nsz arcp contract afn float %mul, %2 + %arrayidx4 = getelementptr inbounds float, ptr %x, i64 6 + %3 = load float, ptr %arrayidx4, align 4 + %mul5 = fmul reassoc nsz arcp contract afn float %mul3, %3 + %arrayidx6 = getelementptr inbounds float, ptr %x, i64 8 + %4 = load float, ptr %arrayidx6, align 4 + %mul7 = fmul reassoc nsz arcp contract afn float %mul5, %4 + %arrayidx8 = getelementptr inbounds float, ptr %x, i64 10 + %5 = load float, ptr %arrayidx8, align 4 + %mul9 = fmul reassoc nsz arcp contract afn float %mul7, %5 + %arrayidx10 = getelementptr inbounds float, ptr %x, i64 12 + %6 = load float, ptr %arrayidx10, align 4 + %mul11 = fmul reassoc nsz arcp contract afn float %mul9, %6 + %arrayidx12 = getelementptr inbounds float, ptr %x, i64 14 + %7 = load float, ptr %arrayidx12, align 4 + %mul13 = fmul reassoc nsz arcp contract afn float %mul11, %7 + %arrayidx14 = getelementptr inbounds float, ptr %x, i64 16 + %8 = load float, ptr %arrayidx14, align 4 + %mul15 = fmul reassoc nsz arcp contract afn float %mul13, %8 + %arrayidx16 = 
getelementptr inbounds float, ptr %x, i64 18 + %9 = load float, ptr %arrayidx16, align 4 + %mul17 = fmul reassoc nsz arcp contract afn float %mul15, %9 + %arrayidx18 = getelementptr inbounds float, ptr %x, i64 20 + %10 = load float, ptr %arrayidx18, align 4 + %mul19 = fmul reassoc nsz arcp contract afn float %mul17, %10 + %arrayidx20 = getelementptr inbounds float, ptr %x, i64 22 + %11 = load float, ptr %arrayidx20, align 4 + %mul21 = fmul reassoc nsz arcp contract afn float %mul19, %11 + %arrayidx22 = getelementptr inbounds float, ptr %x, i64 24 + %12 = load float, ptr %arrayidx22, align 4 + %mul23 = fmul reassoc nsz arcp contract afn float %mul21, %12 + %arrayidx24 = getelementptr inbounds float, ptr %x, i64 26 + %13 = load float, ptr %arrayidx24, align 4 + %mul25 = fmul reassoc nsz arcp contract afn float %mul23, %13 + %arrayidx26 = getelementptr inbounds float, ptr %x, i64 28 + %14 = load float, ptr %arrayidx26, align 4 + %mul27 = fmul reassoc nsz arcp contract afn float %mul25, %14 + %arrayidx28 = getelementptr inbounds float, ptr %x, i64 30 + %15 = load float, ptr %arrayidx28, align 4 + %mul29 = fmul reassoc nsz arcp contract afn float %mul27, %15 + ret float %mul29 +}