diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a0837ab214219..43466b5bb707b 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -22499,53 +22499,16 @@ class HorizontalReduction {
       }
 
       Type *ScalarTy = VL.front()->getType();
-      if (isa<FixedVectorType>(ScalarTy)) {
-        assert(SLPReVec && "FixedVectorType is not expected.");
-        unsigned ScalarTyNumElements = getNumElements(ScalarTy);
-        Value *ReducedSubTree = PoisonValue::get(
-            getWidenedType(ScalarTy->getScalarType(), ScalarTyNumElements));
-        for (unsigned I : seq<unsigned>(ScalarTyNumElements)) {
-          // Do reduction for each lane.
-          // e.g., do reduce add for
-          // VL[0] = <4 x Ty> <a, b, c, d>
-          // VL[1] = <4 x Ty> <e, f, g, h>
-          // Lane[0] = <2 x Ty> <a, e>
-          // Lane[1] = <2 x Ty> <b, f>
-          // Lane[2] = <2 x Ty> <c, g>
-          // Lane[3] = <2 x Ty> <d, h>
-          // result[0] = reduce add Lane[0]
-          // result[1] = reduce add Lane[1]
-          // result[2] = reduce add Lane[2]
-          // result[3] = reduce add Lane[3]
-          SmallVector<int> Mask =
-              createStrideMask(I, ScalarTyNumElements, VL.size());
-          Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask);
-          Value *Val =
-              createSingleOp(Builder, *TTI, Lane,
-                             OptReusedScalars && SameScaleFactor
-                                 ? SameValuesCounter.front().second
-                                 : 1,
-                             Lane->getType()->getScalarType() !=
-                                     VL.front()->getType()->getScalarType()
-                                 ? V.isSignedMinBitwidthRootNode()
-                                 : true,
-                             RdxRootInst->getType());
-          ReducedSubTree =
-              Builder.CreateInsertElement(ReducedSubTree, Val, I);
-        }
-        VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree);
-      } else {
-        Type *VecTy = VectorizedRoot->getType();
-        Type *RedScalarTy = VecTy->getScalarType();
-        VectorValuesAndScales.emplace_back(
-            VectorizedRoot,
-            OptReusedScalars && SameScaleFactor
-                ? SameValuesCounter.front().second
-                : 1,
-            RedScalarTy != ScalarTy->getScalarType()
-                ? V.isSignedMinBitwidthRootNode()
-                : true);
-      }
+      Type *VecTy = VectorizedRoot->getType();
+      Type *RedScalarTy = VecTy->getScalarType();
+      VectorValuesAndScales.emplace_back(
+          VectorizedRoot,
+          OptReusedScalars && SameScaleFactor
+              ? SameValuesCounter.front().second
+              : 1,
+          RedScalarTy != ScalarTy->getScalarType()
+              ? V.isSignedMinBitwidthRootNode()
+              : true);
 
       // Count vectorized reduced values to exclude them from final reduction.
       for (Value *RdxVal : VL) {
@@ -22718,9 +22681,35 @@ class HorizontalReduction {
   Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI,
                         Value *Vec, unsigned Scale, bool IsSigned,
                         Type *DestTy) {
-    Value *Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
-    if (Rdx->getType() != DestTy->getScalarType())
-      Rdx = Builder.CreateIntCast(Rdx, DestTy->getScalarType(), IsSigned);
+    Value *Rdx;
+    if (auto *VecTy = dyn_cast<FixedVectorType>(DestTy)) {
+      unsigned DestTyNumElements = getNumElements(VecTy);
+      unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements;
+      Rdx = PoisonValue::get(
+          getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));
+      for (unsigned I : seq<unsigned>(DestTyNumElements)) {
+        // Do reduction for each lane.
+        // e.g., do reduce add for
+        // VL[0] = <4 x Ty> <a, b, c, d>
+        // VL[1] = <4 x Ty> <e, f, g, h>
+        // Lane[0] = <2 x Ty> <a, e>
+        // Lane[1] = <2 x Ty> <b, f>
+        // Lane[2] = <2 x Ty> <c, g>
+        // Lane[3] = <2 x Ty> <d, h>
+        // result[0] = reduce add Lane[0]
+        // result[1] = reduce add Lane[1]
+        // result[2] = reduce add Lane[2]
+        // result[3] = reduce add Lane[3]
+        SmallVector<int> Mask = createStrideMask(I, DestTyNumElements, VF);
+        Value *Lane = Builder.CreateShuffleVector(Vec, Mask);
+        Rdx = Builder.CreateInsertElement(
+            Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I);
+      }
+    } else {
+      Rdx = emitReduction(Vec, Builder, &TTI, DestTy);
+    }
+    if (Rdx->getType() != DestTy)
+      Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned);
     // Improved analysis for add/fadd/xor reductions with same scale
     // factor for all operands of reductions. We can emit scalar ops for
     // them instead.
@@ -22787,30 +22776,32 @@ class HorizontalReduction {
     case RecurKind::FMul: {
       unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind);
       if (!AllConsts) {
-        if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
-          assert(SLPReVec && "FixedVectorType is not expected.");
-          unsigned ScalarTyNumElements = VecTy->getNumElements();
-          for (unsigned I : seq<unsigned>(ReducedVals.size())) {
-            VectorCost += TTI->getShuffleCost(
-                TTI::SK_PermuteSingleSrc, VectorTy,
-                createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
-            VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF,
-                                                          CostKind);
-          }
-          VectorCost += TTI->getScalarizationOverhead(
-              VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
-              /*Extract*/ false, TTI::TCK_RecipThroughput);
-        } else if (DoesRequireReductionOp) {
-          Type *RedTy = VectorTy->getElementType();
-          auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
-              std::make_pair(RedTy, true));
-          if (RType == RedTy) {
-            VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
-                                                         FMF, CostKind);
+        if (DoesRequireReductionOp) {
+          if (auto *VecTy = dyn_cast<FixedVectorType>(ScalarTy)) {
+            assert(SLPReVec && "FixedVectorType is not expected.");
+            unsigned ScalarTyNumElements = VecTy->getNumElements();
+            for (unsigned I : seq<unsigned>(ReducedVals.size())) {
+              VectorCost += TTI->getShuffleCost(
+                  TTI::SK_PermuteSingleSrc, VectorTy,
+                  createStrideMask(I, ScalarTyNumElements, ReducedVals.size()));
+              VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy,
+                                                            FMF, CostKind);
+            }
+            VectorCost += TTI->getScalarizationOverhead(
+                VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true,
+                /*Extract*/ false, TTI::TCK_RecipThroughput);
          } else {
-            VectorCost = TTI->getExtendedReductionCost(
-                RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth),
-                FMF, CostKind);
+            Type *RedTy = VectorTy->getElementType();
+            auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or(
+                std::make_pair(RedTy, true));
+            if (RType == RedTy) {
+              VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy,
+                                                           FMF, CostKind);
+            } else {
+              VectorCost = TTI->getExtendedReductionCost(
+                  RdxOpcode, !IsSigned, RedTy,
+                  getWidenedType(RType, ReduxWidth), FMF, CostKind);
+            }
          }
        } else {
          Type *RedTy = VectorTy->getElementType();
diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-128169.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-128169.ll
index b9f35451b02ae..1dd6c7b81fb73 100644
--- a/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-128169.ll
+++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-128169.ll
@@ -44,16 +44,16 @@ define void @e(<4 x i16> %0) {
 ; THRESH-NEXT:    [[TMP13:%.*]] = icmp sgt <16 x i16> [[TMP12]], [[TMP7]]
 ; THRESH-NEXT:    [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
 ; THRESH-NEXT:    [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]])
+; THRESH-NEXT:    [[TMP23:%.*]] = insertelement <4 x i1> poison, i1 [[TMP15]], i64 0
 ; THRESH-NEXT:    [[TMP16:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
 ; THRESH-NEXT:    [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]])
+; THRESH-NEXT:    [[TMP24:%.*]] = insertelement <4 x i1> [[TMP23]], i1 [[TMP17]], i64 1
 ; THRESH-NEXT:    [[TMP18:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
 ; THRESH-NEXT:    [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]])
+; THRESH-NEXT:    [[TMP22:%.*]] = insertelement <4 x i1> [[TMP24]], i1 [[TMP19]], i64 2
 ; THRESH-NEXT:    [[TMP20:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
 ; THRESH-NEXT:    [[TMP21:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP20]])
-; THRESH-NEXT:    [[TMP22:%.*]] = insertelement <4 x i1> poison, i1 [[TMP15]], i32 0
-; THRESH-NEXT:    [[TMP23:%.*]] = insertelement <4 x i1> [[TMP22]], i1 [[TMP17]], i32 1
-; THRESH-NEXT:    [[TMP24:%.*]] = insertelement <4 x i1> [[TMP23]], i1 [[TMP19]], i32 2
-; THRESH-NEXT:    [[TMP25:%.*]] = insertelement <4 x i1> [[TMP24]], i1 [[TMP21]], i32 3
+; THRESH-NEXT:    [[TMP25:%.*]] = insertelement <4 x i1> [[TMP22]], i1 [[TMP21]], i64 3
 ; THRESH-NEXT:    [[TMP26]] = zext <4 x i1> [[TMP25]] to <4 x i32>
 ; THRESH-NEXT:    br label [[VECTOR_BODY]]
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
new file mode 100644
index 0000000000000..3d0e6be661fd1
--- /dev/null
+++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
@@ -0,0 +1,76 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 < %s | FileCheck %s
+
+define <4 x i16> @test() {
+; CHECK-LABEL: define <4 x i16> @test() {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> poison, <4 x i16> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> [[TMP0]], <4 x i16> zeroinitializer, i64 4)
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i16> [[TMP1]], [[TMP1]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> poison, <4 x i16> zeroinitializer, i64 0)
+; CHECK-NEXT:    [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> poison, <16 x i32>
+; CHECK-NEXT:    [[TMP6:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP4]], <4 x i16> zeroinitializer, i64 4)
+; CHECK-NEXT:    [[TMP7:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP6]], <4 x i16> zeroinitializer, i64 8)
+; CHECK-NEXT:    [[TMP8:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP7]], <4 x i16> zeroinitializer, i64 12)
+; CHECK-NEXT:    [[TMP9:%.*]] = add <16 x i16> [[TMP5]], [[TMP8]]
+; CHECK-NEXT:    [[TMP10:%.*]] = add <16 x i16> [[TMP8]], [[TMP8]]
+; CHECK-NEXT:    [[TMP11:%.*]] = add <16 x i16> [[TMP3]], [[TMP8]]
+; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[TMP25:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP24]])
+; CHECK-NEXT:    [[TMP26:%.*]] = insertelement <4 x i16> poison, i16 [[TMP25]], i64 0
+; CHECK-NEXT:    [[TMP27:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[TMP28:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP27]])
+; CHECK-NEXT:    [[TMP29:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP28]], i64 1
+; CHECK-NEXT:    [[TMP30:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[TMP31:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP30]])
+; CHECK-NEXT:    [[TMP32:%.*]] = insertelement <4 x i16> [[TMP29]], i16 [[TMP31]], i64 2
+; CHECK-NEXT:    [[TMP33:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP34:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP33]])
+; CHECK-NEXT:    [[TMP35:%.*]] = insertelement <4 x i16> [[TMP32]], i16 [[TMP34]], i64 3
+; CHECK-NEXT:    [[RDX_OP:%.*]] = or <16 x i16> [[TMP11]], [[TMP9]]
+; CHECK-NEXT:    [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT:    [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP36]])
+; CHECK-NEXT:    [[TMP38:%.*]] = insertelement <4 x i16> poison, i16 [[TMP37]], i64 0
+; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT:    [[TMP40:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP39]])
+; CHECK-NEXT:    [[TMP41:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[TMP40]], i64 1
+; CHECK-NEXT:    [[TMP42:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT:    [[TMP43:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP42]])
+; CHECK-NEXT:    [[TMP44:%.*]] = insertelement <4 x i16> [[TMP41]], i16 [[TMP43]], i64 2
+; CHECK-NEXT:    [[TMP45:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT:    [[TMP46:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP45]])
+; CHECK-NEXT:    [[TMP47:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP46]], i64 3
+; CHECK-NEXT:    [[OP_RDX9:%.*]] = or <4 x i16> [[TMP47]], zeroinitializer
+; CHECK-NEXT:    [[OP_RDX11:%.*]] = or <4 x i16> [[OP_RDX9]], [[TMP35]]
+; CHECK-NEXT:    ret <4 x i16> [[OP_RDX11]]
+;
+entry:
+  %subi = add <4 x i16> zeroinitializer, zeroinitializer
+  %sub40.i = add <4 x i16> %subi, zeroinitializer
+  %sub41.i = add <4 x i16> %subi, zeroinitializer
+  %sub42.i = add <4 x i16> %subi, zeroinitializer
+  %sub43.i = add <4 x i16> %subi, zeroinitializer
+  %sub44.i = add <4 x i16> %subi, zeroinitializer
+  %sub45.i = add <4 x i16> %subi, zeroinitializer
+  %sub46.i = add <4 x i16> zeroinitializer, zeroinitializer
+  %sub47.i = add <4 x i16> zeroinitializer, zeroinitializer
+  %sub48.i = add <4 x i16> zeroinitializer, zeroinitializer
+  %sub49.i = add <4 x i16> zeroinitializer, zeroinitializer
+  %or40.i = or <4 x i16> %sub40.i, %sub41.i
+  %or41.i = or <4 x i16> %or40.i, %sub42.i
+  %or42.i = or <4 x i16> %or41.i, %sub43.i
+  %or43.i = or <4 x i16> %or42.i, %sub44.i
+  %or44.i = or <4 x i16> %or43.i, %sub45.i
+  %or45.i = or <4 x i16> %or44.i, %sub46.i
+  %or46.i = or <4 x i16> %or45.i, %sub47.i
+  %or47.i = or <4 x i16> %or46.i, %sub48.i
+  %or48.i = or <4 x i16> %or47.i, %sub49.i
+  %or50.i = or <4 x i16> %or48.i, %subi
+  %subii = add <4 x i16> zeroinitializer, zeroinitializer
+  %subi16.i = add <4 x i16> %subii, zeroinitializer
+  %subi17.i = add <4 x i16> %subii, zeroinitializer
+  %0 = or <4 x i16> %subi16.i, %subi17.i
+  %1 = or <4 x i16> %0, %or50.i
+  ret <4 x i16> %1
+}
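
Note (illustration only, not part of the patch): with REVEC the reduced "scalar" type can itself be a FixedVectorType, and the llvm.vector.reduce.* intrinsics produce true scalars, so createSingleOp now splits such a reduction into one strided shuffle plus one scalar reduction per result lane and reassembles the result with insertelement. The IR below is a minimal hand-written sketch of that shape for a <4 x i16> result reduced from a widened <16 x i16> root; the function name @per_lane_reduce_sketch is hypothetical, and reduce.or stands in for whatever RecurKind applies. It is the same pattern the THRESH and CHECK lines above verify, and the cost-model hunk prices it the same way: one shuffle plus one reduction per lane.

define <4 x i16> @per_lane_reduce_sketch(<16 x i16> %vec) {
  ; Lane 0: gather element 0 of each of the four reduced <4 x i16> values;
  ; createStrideMask(0, /*Stride=*/4, /*VF=*/4) yields <0, 4, 8, 12>.
  %lane0 = shufflevector <16 x i16> %vec, <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
  %r0 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %lane0)
  %acc0 = insertelement <4 x i16> poison, i16 %r0, i64 0
  ; Lanes 1..3 repeat the pattern with stride masks starting at 1, 2, and 3.
  %lane1 = shufflevector <16 x i16> %vec, <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
  %r1 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %lane1)
  %acc1 = insertelement <4 x i16> %acc0, i16 %r1, i64 1
  %lane2 = shufflevector <16 x i16> %vec, <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
  %r2 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %lane2)
  %acc2 = insertelement <4 x i16> %acc1, i16 %r2, i64 2
  %lane3 = shufflevector <16 x i16> %vec, <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
  %r3 = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> %lane3)
  %acc3 = insertelement <4 x i16> %acc2, i16 %r3, i64 3
  ret <4 x i16> %acc3
}

declare i16 @llvm.vector.reduce.or.v4i16(<4 x i16>)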