From 5a987ef2cd7261c2d10af41571bcb93ef09d3528 Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Thu, 10 Apr 2025 21:01:03 -0700 Subject: [PATCH 1/3] [SLP][REVEC] Pre-commit test. --- .../revec-reduced-value-vectorized-later.ll | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll new file mode 100644 index 0000000000000..656f216b0093e --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll @@ -0,0 +1,88 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -slp-revec -slp-max-reg-size=1024 -slp-threshold=-100 < %s | FileCheck %s + +define <4 x i16> @test() { +; CHECK-LABEL: define <4 x i16> @test() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> poison, <4 x i16> zeroinitializer, i64 0) +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.vector.insert.v8i16.v4i16(<8 x i16> [[TMP0]], <4 x i16> zeroinitializer, i64 4) +; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i16> [[TMP1]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> poison, <4 x i16> zeroinitializer, i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> poison, <16 x i32> +; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP4]], <4 x i16> zeroinitializer, i64 4) +; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP6]], <4 x i16> zeroinitializer, i64 8) +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v4i16(<16 x i16> [[TMP7]], <4 x i16> zeroinitializer, i64 12) +; CHECK-NEXT: [[TMP9:%.*]] = add <16 x i16> [[TMP5]], [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = add <16 x i16> [[TMP8]], [[TMP8]] +; CHECK-NEXT: [[TMP11:%.*]] = add <16 x i16> [[TMP3]], [[TMP8]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP13:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP12]]) +; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i16> poison, i16 [[TMP13]], i64 0 +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP15]]) +; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i16> [[TMP14]], i16 [[TMP16]], i64 1 +; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP19:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP18]]) +; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP19]], i64 2 +; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP22:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP21]]) +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP22]], i64 3 +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP25:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP24]]) +; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> poison, i16 [[TMP25]], i64 0 +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP28:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP27]]) +; CHECK-NEXT: [[TMP29:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP28]], i64 1 +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP31:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP30]]) +; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i16> [[TMP29]], i16 [[TMP31]], i64 2 +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP34:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP33]]) +; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x i16> [[TMP32]], i16 [[TMP34]], i64 3 +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP36]]) +; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i16> poison, i16 [[TMP37]], i64 0 +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP40:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP39]]) +; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[TMP40]], i64 1 +; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP43:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP42]]) +; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP41]], i16 [[TMP43]], i64 2 +; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP46:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP45]]) +; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP46]], i64 3 +; CHECK-NEXT: [[OP_RDX9:%.*]] = or <4 x i16> [[TMP35]], [[TMP47]] +; CHECK-NEXT: [[OP_RDX10:%.*]] = or <4 x i16> [[OP_RDX9]], zeroinitializer +; CHECK-NEXT: [[OP_RDX11:%.*]] = or <4 x i16> [[OP_RDX10]], [[TMP23]] +; CHECK-NEXT: ret <4 x i16> [[OP_RDX11]] +; +entry: + %subi = add <4 x i16> zeroinitializer, zeroinitializer + %sub40.i = add <4 x i16> %subi, zeroinitializer + %sub41.i = add <4 x i16> %subi, zeroinitializer + %sub42.i = add <4 x i16> %subi, zeroinitializer + %sub43.i = add <4 x i16> %subi, zeroinitializer + %sub44.i = add <4 x i16> %subi, zeroinitializer + %sub45.i = add <4 x i16> %subi, zeroinitializer + %sub46.i = add <4 x i16> zeroinitializer, zeroinitializer + %sub47.i = add <4 x i16> zeroinitializer, zeroinitializer + %sub48.i = add <4 x i16> zeroinitializer, zeroinitializer + %sub49.i = add <4 x i16> zeroinitializer, zeroinitializer + %or40.i = or <4 x i16> %sub40.i, %sub41.i + %or41.i = or <4 x i16> %or40.i, %sub42.i + %or42.i = or <4 x i16> %or41.i, %sub43.i + %or43.i = or <4 x i16> %or42.i, %sub44.i + %or44.i = or <4 x i16> %or43.i, %sub45.i + %or45.i = or <4 x i16> %or44.i, %sub46.i + %or46.i = or <4 x i16> %or45.i, %sub47.i + %or47.i = or <4 x i16> %or46.i, %sub48.i + %or48.i = or <4 x i16> %or47.i, %sub49.i + %or50.i = or <4 x i16> %or48.i, %subi + %subii = add <4 x i16> zeroinitializer, zeroinitializer + %subi16.i = add <4 x i16> %subii, zeroinitializer + %subi17.i = add <4 x i16> %subii, zeroinitializer + %0 = or <4 x i16> %subi16.i, %subi17.i + %1 = or <4 x i16> %0, %or50.i + ret <4 x i16> %1 +} From 6897f7d36de0f4b15c21db04b35dc1836845a13d Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Tue, 15 Apr 2025 01:07:59 -0700 Subject: [PATCH 2/3] [SLP][REVEC] VectorValuesAndScales should be supported by REVEC. We should align REVEC with the SLP algorithm as closely as possible. For example, by applying REVEC-specific handling when calling IRBuilder's Create methods, performing cost analysis via TTI, and expanding shuffle masks using transformScalarShuffleIndicesToVector. reference commit: 3b18d47ecbaba4e519ebf0d1bc134a404a56a9da --- .../Transforms/Vectorize/SLPVectorizer.cpp | 137 ++++++++---------- .../SLPVectorizer/SystemZ/revec-fix-128169.ll | 8 +- .../revec-reduced-value-vectorized-later.ll | 34 ++--- 3 files changed, 79 insertions(+), 100 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index a0837ab214219..b8007b7d8fb4d 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -22499,53 +22499,16 @@ class HorizontalReduction { } Type *ScalarTy = VL.front()->getType(); - if (isa(ScalarTy)) { - assert(SLPReVec && "FixedVectorType is not expected."); - unsigned ScalarTyNumElements = getNumElements(ScalarTy); - Value *ReducedSubTree = PoisonValue::get( - getWidenedType(ScalarTy->getScalarType(), ScalarTyNumElements)); - for (unsigned I : seq(ScalarTyNumElements)) { - // Do reduction for each lane. - // e.g., do reduce add for - // VL[0] = <4 x Ty> - // VL[1] = <4 x Ty> - // Lane[0] = <2 x Ty> - // Lane[1] = <2 x Ty> - // Lane[2] = <2 x Ty> - // Lane[3] = <2 x Ty> - // result[0] = reduce add Lane[0] - // result[1] = reduce add Lane[1] - // result[2] = reduce add Lane[2] - // result[3] = reduce add Lane[3] - SmallVector Mask = - createStrideMask(I, ScalarTyNumElements, VL.size()); - Value *Lane = Builder.CreateShuffleVector(VectorizedRoot, Mask); - Value *Val = - createSingleOp(Builder, *TTI, Lane, - OptReusedScalars && SameScaleFactor - ? SameValuesCounter.front().second - : 1, - Lane->getType()->getScalarType() != - VL.front()->getType()->getScalarType() - ? V.isSignedMinBitwidthRootNode() - : true, - RdxRootInst->getType()); - ReducedSubTree = - Builder.CreateInsertElement(ReducedSubTree, Val, I); - } - VectorizedTree = GetNewVectorizedTree(VectorizedTree, ReducedSubTree); - } else { - Type *VecTy = VectorizedRoot->getType(); - Type *RedScalarTy = VecTy->getScalarType(); - VectorValuesAndScales.emplace_back( - VectorizedRoot, - OptReusedScalars && SameScaleFactor - ? SameValuesCounter.front().second - : 1, - RedScalarTy != ScalarTy->getScalarType() - ? V.isSignedMinBitwidthRootNode() - : true); - } + Type *VecTy = VectorizedRoot->getType(); + Type *RedScalarTy = VecTy->getScalarType(); + VectorValuesAndScales.emplace_back( + VectorizedRoot, + OptReusedScalars && SameScaleFactor + ? SameValuesCounter.front().second + : 1, + RedScalarTy != ScalarTy->getScalarType() + ? V.isSignedMinBitwidthRootNode() + : true); // Count vectorized reduced values to exclude them from final reduction. for (Value *RdxVal : VL) { @@ -22718,9 +22681,35 @@ class HorizontalReduction { Value *createSingleOp(IRBuilderBase &Builder, const TargetTransformInfo &TTI, Value *Vec, unsigned Scale, bool IsSigned, Type *DestTy) { - Value *Rdx = emitReduction(Vec, Builder, &TTI, DestTy); - if (Rdx->getType() != DestTy->getScalarType()) - Rdx = Builder.CreateIntCast(Rdx, DestTy->getScalarType(), IsSigned); + Value *Rdx; + if (auto *VecTy = dyn_cast(DestTy)) { + unsigned DestTyNumElements = VecTy->getNumElements(); + unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements; + Rdx = PoisonValue::get( + getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements)); + for (unsigned I : seq(DestTyNumElements)) { + // Do reduction for each lane. + // e.g., do reduce add for + // VL[0] = <4 x Ty> + // VL[1] = <4 x Ty> + // Lane[0] = <2 x Ty> + // Lane[1] = <2 x Ty> + // Lane[2] = <2 x Ty> + // Lane[3] = <2 x Ty> + // result[0] = reduce add Lane[0] + // result[1] = reduce add Lane[1] + // result[2] = reduce add Lane[2] + // result[3] = reduce add Lane[3] + SmallVector Mask = createStrideMask(I, DestTyNumElements, VF); + Value *Lane = Builder.CreateShuffleVector(Vec, Mask); + Rdx = Builder.CreateInsertElement( + Rdx, emitReduction(Lane, Builder, &TTI, DestTy), I); + } + } else { + Rdx = emitReduction(Vec, Builder, &TTI, DestTy); + } + if (Rdx->getType() != DestTy) + Rdx = Builder.CreateIntCast(Rdx, DestTy, IsSigned); // Improved analysis for add/fadd/xor reductions with same scale // factor for all operands of reductions. We can emit scalar ops for // them instead. @@ -22787,30 +22776,32 @@ class HorizontalReduction { case RecurKind::FMul: { unsigned RdxOpcode = RecurrenceDescriptor::getOpcode(RdxKind); if (!AllConsts) { - if (auto *VecTy = dyn_cast(ScalarTy)) { - assert(SLPReVec && "FixedVectorType is not expected."); - unsigned ScalarTyNumElements = VecTy->getNumElements(); - for (unsigned I : seq(ReducedVals.size())) { - VectorCost += TTI->getShuffleCost( - TTI::SK_PermuteSingleSrc, VectorTy, - createStrideMask(I, ScalarTyNumElements, ReducedVals.size())); - VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, FMF, - CostKind); - } - VectorCost += TTI->getScalarizationOverhead( - VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true, - /*Extract*/ false, TTI::TCK_RecipThroughput); - } else if (DoesRequireReductionOp) { - Type *RedTy = VectorTy->getElementType(); - auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or( - std::make_pair(RedTy, true)); - if (RType == RedTy) { - VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, - FMF, CostKind); + if (DoesRequireReductionOp) { + if (auto *VecTy = dyn_cast(ScalarTy)) { + assert(SLPReVec && "FixedVectorType is not expected."); + unsigned ScalarTyNumElements = VecTy->getNumElements(); + for (unsigned I : seq(ReducedVals.size())) { + VectorCost += TTI->getShuffleCost( + TTI::SK_PermuteSingleSrc, VectorTy, + createStrideMask(I, ScalarTyNumElements, ReducedVals.size())); + VectorCost += TTI->getArithmeticReductionCost(RdxOpcode, VecTy, + FMF, CostKind); + } + VectorCost += TTI->getScalarizationOverhead( + VecTy, APInt::getAllOnes(ScalarTyNumElements), /*Insert*/ true, + /*Extract*/ false, TTI::TCK_RecipThroughput); } else { - VectorCost = TTI->getExtendedReductionCost( - RdxOpcode, !IsSigned, RedTy, getWidenedType(RType, ReduxWidth), - FMF, CostKind); + Type *RedTy = VectorTy->getElementType(); + auto [RType, IsSigned] = R.getRootNodeTypeWithNoCast().value_or( + std::make_pair(RedTy, true)); + if (RType == RedTy) { + VectorCost = TTI->getArithmeticReductionCost(RdxOpcode, VectorTy, + FMF, CostKind); + } else { + VectorCost = TTI->getExtendedReductionCost( + RdxOpcode, !IsSigned, RedTy, + getWidenedType(RType, ReduxWidth), FMF, CostKind); + } } } else { Type *RedTy = VectorTy->getElementType(); diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-128169.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-128169.ll index b9f35451b02ae..1dd6c7b81fb73 100644 --- a/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-128169.ll +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/revec-fix-128169.ll @@ -44,16 +44,16 @@ define void @e(<4 x i16> %0) { ; THRESH-NEXT: [[TMP13:%.*]] = icmp sgt <16 x i16> [[TMP12]], [[TMP7]] ; THRESH-NEXT: [[TMP14:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> ; THRESH-NEXT: [[TMP15:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP14]]) +; THRESH-NEXT: [[TMP23:%.*]] = insertelement <4 x i1> poison, i1 [[TMP15]], i64 0 ; THRESH-NEXT: [[TMP16:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> ; THRESH-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP16]]) +; THRESH-NEXT: [[TMP24:%.*]] = insertelement <4 x i1> [[TMP23]], i1 [[TMP17]], i64 1 ; THRESH-NEXT: [[TMP18:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> ; THRESH-NEXT: [[TMP19:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP18]]) +; THRESH-NEXT: [[TMP22:%.*]] = insertelement <4 x i1> [[TMP24]], i1 [[TMP19]], i64 2 ; THRESH-NEXT: [[TMP20:%.*]] = shufflevector <16 x i1> [[TMP13]], <16 x i1> poison, <4 x i32> ; THRESH-NEXT: [[TMP21:%.*]] = call i1 @llvm.vector.reduce.or.v4i1(<4 x i1> [[TMP20]]) -; THRESH-NEXT: [[TMP22:%.*]] = insertelement <4 x i1> poison, i1 [[TMP15]], i32 0 -; THRESH-NEXT: [[TMP23:%.*]] = insertelement <4 x i1> [[TMP22]], i1 [[TMP17]], i32 1 -; THRESH-NEXT: [[TMP24:%.*]] = insertelement <4 x i1> [[TMP23]], i1 [[TMP19]], i32 2 -; THRESH-NEXT: [[TMP25:%.*]] = insertelement <4 x i1> [[TMP24]], i1 [[TMP21]], i32 3 +; THRESH-NEXT: [[TMP25:%.*]] = insertelement <4 x i1> [[TMP22]], i1 [[TMP21]], i64 3 ; THRESH-NEXT: [[TMP26]] = zext <4 x i1> [[TMP25]] to <4 x i32> ; THRESH-NEXT: br label [[VECTOR_BODY]] ; diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll index 656f216b0093e..3d0e6be661fd1 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll @@ -16,45 +16,33 @@ define <4 x i16> @test() { ; CHECK-NEXT: [[TMP9:%.*]] = add <16 x i16> [[TMP5]], [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = add <16 x i16> [[TMP8]], [[TMP8]] ; CHECK-NEXT: [[TMP11:%.*]] = add <16 x i16> [[TMP3]], [[TMP8]] -; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP13:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP12]]) -; CHECK-NEXT: [[TMP14:%.*]] = insertelement <4 x i16> poison, i16 [[TMP13]], i64 0 -; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP16:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP15]]) -; CHECK-NEXT: [[TMP17:%.*]] = insertelement <4 x i16> [[TMP14]], i16 [[TMP16]], i64 1 -; CHECK-NEXT: [[TMP18:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP19:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP18]]) -; CHECK-NEXT: [[TMP20:%.*]] = insertelement <4 x i16> [[TMP17]], i16 [[TMP19]], i64 2 -; CHECK-NEXT: [[TMP21:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> -; CHECK-NEXT: [[TMP22:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP21]]) -; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i16> [[TMP20]], i16 [[TMP22]], i64 3 -; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP25:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP24]]) ; CHECK-NEXT: [[TMP26:%.*]] = insertelement <4 x i16> poison, i16 [[TMP25]], i64 0 -; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP27:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP28:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP27]]) ; CHECK-NEXT: [[TMP29:%.*]] = insertelement <4 x i16> [[TMP26]], i16 [[TMP28]], i64 1 -; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP30:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP31:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP30]]) ; CHECK-NEXT: [[TMP32:%.*]] = insertelement <4 x i16> [[TMP29]], i16 [[TMP31]], i64 2 -; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i16> [[TMP11]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP33:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP34:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP33]]) ; CHECK-NEXT: [[TMP35:%.*]] = insertelement <4 x i16> [[TMP32]], i16 [[TMP34]], i64 3 -; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[RDX_OP:%.*]] = or <16 x i16> [[TMP11]], [[TMP9]] +; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP36]]) ; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i16> poison, i16 [[TMP37]], i64 0 -; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP40:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP39]]) ; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[TMP40]], i64 1 -; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP43:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP42]]) ; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP41]], i16 [[TMP43]], i64 2 -; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i16> [[TMP9]], <16 x i16> poison, <4 x i32> +; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> ; CHECK-NEXT: [[TMP46:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP45]]) ; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP46]], i64 3 -; CHECK-NEXT: [[OP_RDX9:%.*]] = or <4 x i16> [[TMP35]], [[TMP47]] -; CHECK-NEXT: [[OP_RDX10:%.*]] = or <4 x i16> [[OP_RDX9]], zeroinitializer -; CHECK-NEXT: [[OP_RDX11:%.*]] = or <4 x i16> [[OP_RDX10]], [[TMP23]] +; CHECK-NEXT: [[OP_RDX9:%.*]] = or <4 x i16> [[TMP47]], zeroinitializer +; CHECK-NEXT: [[OP_RDX11:%.*]] = or <4 x i16> [[OP_RDX9]], [[TMP35]] ; CHECK-NEXT: ret <4 x i16> [[OP_RDX11]] ; entry: From 760efb5473dc81374371ed78e02edcc19c2dcc25 Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Tue, 15 Apr 2025 07:25:10 -0700 Subject: [PATCH 3/3] apply comment --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index b8007b7d8fb4d..43466b5bb707b 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -22683,7 +22683,7 @@ class HorizontalReduction { Type *DestTy) { Value *Rdx; if (auto *VecTy = dyn_cast(DestTy)) { - unsigned DestTyNumElements = VecTy->getNumElements(); + unsigned DestTyNumElements = getNumElements(VecTy); unsigned VF = getNumElements(Vec->getType()) / DestTyNumElements; Rdx = PoisonValue::get( getWidenedType(Vec->getType()->getScalarType(), DestTyNumElements));