From 8bbd7f9a0ffbef95beedb66756c5d6eab560510a Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 25 Apr 2025 12:11:10 -0700 Subject: [PATCH 1/3] [SLPVectorizer] Use accurate cost for external users of resize shuffles Change-Id: I7620d2bd3d65be994bd290b84267832fdb4f1bb4 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 54 +++++--- .../SLPVectorizer/AMDGPU/external-shuffle.ll | 128 ++++++------------ .../SLPVectorizer/X86/buildvector-shuffle.ll | 9 +- .../extractelement-single-use-many-nodes.ll | 11 +- .../X86/insertelements-with-reused-indices.ll | 10 +- 5 files changed, 96 insertions(+), 116 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index c3ca22dce0cc4..fd58da64901f3 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -14886,25 +14886,47 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, Cost += ExtractCost; auto &&ResizeToVF = [this, &Cost](const TreeEntry *TE, ArrayRef Mask, - bool) { + bool ForSingleMask) { InstructionCost C = 0; unsigned VF = Mask.size(); unsigned VecVF = TE->getVectorFactor(); - if (VF != VecVF && - (any_of(Mask, [VF](int Idx) { return Idx >= static_cast(VF); }) || - !ShuffleVectorInst::isIdentityMask(Mask, VF))) { - SmallVector OrigMask(VecVF, PoisonMaskElem); - std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), - OrigMask.begin()); - C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, - getWidenedType(TE->getMainOp()->getType(), VecVF), - OrigMask); - LLVM_DEBUG( - dbgs() << "SLP: Adding cost " << C - << " for final shuffle of insertelement external users.\n"; - TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); - Cost += C; - return std::make_pair(TE, true); + bool HasLargeIndex = + any_of(Mask, [VF](int Idx) { return Idx >= static_cast(VF); }); + if ((VF != VecVF && HasLargeIndex) || + !ShuffleVectorInst::isIdentityMask(Mask, VF)) { + + if (HasLargeIndex) { + SmallVector OrigMask(VecVF, PoisonMaskElem); + std::copy(Mask.begin(), std::next(Mask.begin(), std::min(VF, VecVF)), + OrigMask.begin()); + C = ::getShuffleCost(*TTI, TTI::SK_PermuteSingleSrc, + getWidenedType(TE->getMainOp()->getType(), VecVF), + OrigMask); + LLVM_DEBUG( + dbgs() << "SLP: Adding cost " << C + << " for final shuffle of insertelement external users.\n"; + TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); + Cost += C; + return std::make_pair(TE, true); + } + + if (!ForSingleMask) { + SmallVector ResizeMask(VF, PoisonMaskElem); + for (unsigned I = 0; I < VF; ++I) { + if (Mask[I] != PoisonMaskElem) + ResizeMask[Mask[I]] = Mask[I]; + } + if (!ShuffleVectorInst::isIdentityMask(Mask, VF)) + C = ::getShuffleCost( + *TTI, TTI::SK_PermuteSingleSrc, + getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask); + LLVM_DEBUG( + dbgs() << "SLP: Adding cost " << C + << " for final shuffle of insertelement external users.\n"; + TE->dump(); dbgs() << "SLP: Current total cost = " << Cost << "\n"); + + Cost += C; + } } return std::make_pair(TE, false); }; diff --git a/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll index ce9e47a03dee3..f3e89b60b8045 100644 --- a/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/AMDGPU/external-shuffle.ll @@ -10,124 +10,84 @@ define void @phi_4(ptr addrspace(3) %inptr0, ptr addrspace(3) %inptr1, ptr %out, ; GCN-NEXT: [[ENTRY:.*]]: ; GCN-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(3) [[INPTR0]], align 8 ; GCN-NEXT: [[GEP2:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 2 -; GCN-NEXT: [[GEP3:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 3 ; GCN-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2 ; GCN-NEXT: [[GEP4:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 4 -; GCN-NEXT: [[GEP5:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 5 ; GCN-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8 ; GCN-NEXT: [[GEP6:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 6 -; GCN-NEXT: [[GEP7:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 7 ; GCN-NEXT: [[TMP3:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2 ; GCN-NEXT: [[GEP8:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 8 -; GCN-NEXT: [[GEP9:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 9 ; GCN-NEXT: [[TMP4:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8 ; GCN-NEXT: [[GEP10:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 10 -; GCN-NEXT: [[GEP11:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 11 ; GCN-NEXT: [[TMP5:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2 ; GCN-NEXT: [[GEP12:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 12 -; GCN-NEXT: [[GEP13:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 13 ; GCN-NEXT: [[TMP6:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8 ; GCN-NEXT: [[GEP14:%.*]] = getelementptr i16, ptr addrspace(3) [[INPTR0]], i32 14 ; GCN-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP14]], align 2 -; GCN-NEXT: [[TMP8:%.*]] = extractelement <2 x i16> [[TMP1]], i32 0 -; GCN-NEXT: [[TMP9:%.*]] = extractelement <2 x i16> [[TMP1]], i32 1 -; GCN-NEXT: [[TMP10:%.*]] = extractelement <2 x i16> [[TMP2]], i32 0 -; GCN-NEXT: [[TMP11:%.*]] = extractelement <2 x i16> [[TMP2]], i32 1 -; GCN-NEXT: [[TMP12:%.*]] = extractelement <2 x i16> [[TMP3]], i32 0 -; GCN-NEXT: [[TMP13:%.*]] = extractelement <2 x i16> [[TMP3]], i32 1 -; GCN-NEXT: [[TMP14:%.*]] = extractelement <2 x i16> [[TMP4]], i32 0 -; GCN-NEXT: [[TMP15:%.*]] = extractelement <2 x i16> [[TMP4]], i32 1 -; GCN-NEXT: [[TMP24:%.*]] = extractelement <2 x i16> [[TMP5]], i32 0 -; GCN-NEXT: [[TMP26:%.*]] = extractelement <2 x i16> [[TMP5]], i32 1 -; GCN-NEXT: [[TMP28:%.*]] = extractelement <2 x i16> [[TMP6]], i32 0 -; GCN-NEXT: [[TMP38:%.*]] = extractelement <2 x i16> [[TMP6]], i32 1 ; GCN-NEXT: br label %[[DO_BODY:.*]] ; GCN: [[DO_BODY]]: -; GCN-NEXT: [[PHI2:%.*]] = phi i16 [ [[TMP8]], %[[ENTRY]] ], [ [[TMP30:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI3:%.*]] = phi i16 [ [[TMP9]], %[[ENTRY]] ], [ [[OTHERELE3:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI4:%.*]] = phi i16 [ [[TMP10]], %[[ENTRY]] ], [ [[TMP39:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI5:%.*]] = phi i16 [ [[TMP11]], %[[ENTRY]] ], [ [[OTHERELE5:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI6:%.*]] = phi i16 [ [[TMP12]], %[[ENTRY]] ], [ [[TMP32:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI7:%.*]] = phi i16 [ [[TMP13]], %[[ENTRY]] ], [ [[OTHERELE7:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI8:%.*]] = phi i16 [ [[TMP14]], %[[ENTRY]] ], [ [[TMP40:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI9:%.*]] = phi i16 [ [[TMP15]], %[[ENTRY]] ], [ [[OTHERELE9:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI10:%.*]] = phi i16 [ [[TMP24]], %[[ENTRY]] ], [ [[TMP34:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI11:%.*]] = phi i16 [ [[TMP26]], %[[ENTRY]] ], [ [[OTHERELE11:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI12:%.*]] = phi i16 [ [[TMP28]], %[[ENTRY]] ], [ [[TMP35:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[PHI13:%.*]] = phi i16 [ [[TMP38]], %[[ENTRY]] ], [ [[OTHERELE13:%.*]], %[[DO_BODY]] ] -; GCN-NEXT: [[TMP41:%.*]] = phi <2 x i16> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP16:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP8:%.*]] = phi <2 x i16> [ [[TMP0]], %[[ENTRY]] ], [ [[TMP16:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP9:%.*]] = phi <2 x i16> [ [[TMP1]], %[[ENTRY]] ], [ [[TMP17:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP10:%.*]] = phi <2 x i16> [ [[TMP2]], %[[ENTRY]] ], [ [[TMP18:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP11:%.*]] = phi <2 x i16> [ [[TMP3]], %[[ENTRY]] ], [ [[TMP19:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP12:%.*]] = phi <2 x i16> [ [[TMP4]], %[[ENTRY]] ], [ [[TMP20:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP13:%.*]] = phi <2 x i16> [ [[TMP5]], %[[ENTRY]] ], [ [[TMP21:%.*]], %[[DO_BODY]] ] +; GCN-NEXT: [[TMP14:%.*]] = phi <2 x i16> [ [[TMP6]], %[[ENTRY]] ], [ [[TMP22:%.*]], %[[DO_BODY]] ] ; GCN-NEXT: [[TMP42:%.*]] = phi <2 x i16> [ [[TMP7]], %[[ENTRY]] ], [ [[TMP23:%.*]], %[[DO_BODY]] ] ; GCN-NEXT: [[TMP16]] = load <2 x i16>, ptr addrspace(3) [[INPTR0]], align 8 -; GCN-NEXT: [[OTHERELE3]] = load i16, ptr addrspace(3) [[GEP3]], align 1 -; GCN-NEXT: [[TMP17:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2 -; GCN-NEXT: [[OTHERELE5]] = load i16, ptr addrspace(3) [[GEP5]], align 1 -; GCN-NEXT: [[TMP18:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8 -; GCN-NEXT: [[OTHERELE7]] = load i16, ptr addrspace(3) [[GEP7]], align 1 -; GCN-NEXT: [[TMP19:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2 -; GCN-NEXT: [[OTHERELE9]] = load i16, ptr addrspace(3) [[GEP9]], align 1 -; GCN-NEXT: [[TMP20:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8 -; GCN-NEXT: [[OTHERELE11]] = load i16, ptr addrspace(3) [[GEP11]], align 1 -; GCN-NEXT: [[TMP21:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2 -; GCN-NEXT: [[OTHERELE13]] = load i16, ptr addrspace(3) [[GEP13]], align 1 -; GCN-NEXT: [[TMP22:%.*]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8 +; GCN-NEXT: [[TMP17]] = load <2 x i16>, ptr addrspace(3) [[GEP2]], align 2 +; GCN-NEXT: [[TMP18]] = load <2 x i16>, ptr addrspace(3) [[GEP4]], align 8 +; GCN-NEXT: [[TMP19]] = load <2 x i16>, ptr addrspace(3) [[GEP6]], align 2 +; GCN-NEXT: [[TMP20]] = load <2 x i16>, ptr addrspace(3) [[GEP8]], align 8 +; GCN-NEXT: [[TMP21]] = load <2 x i16>, ptr addrspace(3) [[GEP10]], align 2 +; GCN-NEXT: [[TMP22]] = load <2 x i16>, ptr addrspace(3) [[GEP12]], align 8 ; GCN-NEXT: [[TMP23]] = load <2 x i16>, ptr addrspace(3) [[GEP14]], align 2 ; GCN-NEXT: [[CMP:%.*]] = icmp eq i32 [[FLAG]], 0 -; GCN-NEXT: [[TMP30]] = extractelement <2 x i16> [[TMP17]], i32 0 -; GCN-NEXT: [[TMP39]] = extractelement <2 x i16> [[TMP18]], i32 0 -; GCN-NEXT: [[TMP32]] = extractelement <2 x i16> [[TMP19]], i32 0 -; GCN-NEXT: [[TMP40]] = extractelement <2 x i16> [[TMP20]], i32 0 -; GCN-NEXT: [[TMP34]] = extractelement <2 x i16> [[TMP21]], i32 0 -; GCN-NEXT: [[TMP35]] = extractelement <2 x i16> [[TMP22]], i32 0 ; GCN-NEXT: br i1 [[CMP]], label %[[EXIT:.*]], label %[[DO_BODY]] ; GCN: [[EXIT]]: -; GCN-NEXT: [[TMP36:%.*]] = shufflevector <2 x i16> [[TMP16]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[TMP37:%.*]] = shufflevector <2 x i16> [[TMP17]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[VEC038:%.*]] = shufflevector <16 x i16> [[TMP36]], <16 x i16> [[TMP37]], <16 x i32> +; GCN-NEXT: [[TMP24:%.*]] = shufflevector <2 x i16> [[TMP16]], <2 x i16> [[TMP17]], <16 x i32> ; GCN-NEXT: [[TMP25:%.*]] = shufflevector <2 x i16> [[TMP18]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[VEC059:%.*]] = shufflevector <16 x i16> [[VEC038]], <16 x i16> [[TMP25]], <16 x i32> +; GCN-NEXT: [[TMP26:%.*]] = shufflevector <16 x i16> [[TMP24]], <16 x i16> [[TMP25]], <16 x i32> ; GCN-NEXT: [[TMP27:%.*]] = shufflevector <2 x i16> [[TMP19]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[VEC0710:%.*]] = shufflevector <16 x i16> [[VEC059]], <16 x i16> [[TMP27]], <16 x i32> +; GCN-NEXT: [[TMP28:%.*]] = shufflevector <16 x i16> [[TMP26]], <16 x i16> [[TMP27]], <16 x i32> ; GCN-NEXT: [[TMP29:%.*]] = shufflevector <2 x i16> [[TMP20]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[VEC0911:%.*]] = shufflevector <16 x i16> [[VEC0710]], <16 x i16> [[TMP29]], <16 x i32> +; GCN-NEXT: [[TMP30:%.*]] = shufflevector <16 x i16> [[TMP28]], <16 x i16> [[TMP29]], <16 x i32> ; GCN-NEXT: [[TMP31:%.*]] = shufflevector <2 x i16> [[TMP21]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[VEC01112:%.*]] = shufflevector <16 x i16> [[VEC0911]], <16 x i16> [[TMP31]], <16 x i32> +; GCN-NEXT: [[TMP32:%.*]] = shufflevector <16 x i16> [[TMP30]], <16 x i16> [[TMP31]], <16 x i32> ; GCN-NEXT: [[TMP33:%.*]] = shufflevector <2 x i16> [[TMP22]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[TMP58:%.*]] = shufflevector <16 x i16> [[VEC01112]], <16 x i16> [[TMP33]], <16 x i32> -; GCN-NEXT: [[TMP60:%.*]] = shufflevector <2 x i16> [[TMP23]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP47:%.*]] = shufflevector <16 x i16> [[TMP32]], <16 x i16> [[TMP33]], <16 x i32> +; GCN-NEXT: [[TMP48:%.*]] = shufflevector <2 x i16> [[TMP23]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP49:%.*]] = shufflevector <16 x i16> [[TMP47]], <16 x i16> [[TMP48]], <16 x i32> +; GCN-NEXT: [[TMP37:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <16 x i32> +; GCN-NEXT: [[TMP38:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[TMP37]], <16 x i16> [[TMP38]], <16 x i32> +; GCN-NEXT: [[TMP40:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP41:%.*]] = shufflevector <16 x i16> [[TMP39]], <16 x i16> [[TMP40]], <16 x i32> +; GCN-NEXT: [[TMP57:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP43:%.*]] = shufflevector <16 x i16> [[TMP41]], <16 x i16> [[TMP57]], <16 x i32> +; GCN-NEXT: [[TMP44:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP45:%.*]] = shufflevector <16 x i16> [[TMP43]], <16 x i16> [[TMP44]], <16 x i32> +; GCN-NEXT: [[TMP46:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP58:%.*]] = shufflevector <16 x i16> [[TMP45]], <16 x i16> [[TMP46]], <16 x i32> +; GCN-NEXT: [[TMP60:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <16 x i32> ; GCN-NEXT: [[VEC2157:%.*]] = shufflevector <16 x i16> [[TMP58]], <16 x i16> [[TMP60]], <16 x i32> -; GCN-NEXT: [[TMP50:%.*]] = shufflevector <2 x i16> [[TMP0]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[TMP51:%.*]] = shufflevector <2 x i16> [[TMP1]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP50:%.*]] = shufflevector <2 x i16> [[TMP8]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP51:%.*]] = shufflevector <2 x i16> [[TMP9]], <2 x i16> poison, <16 x i32> ; GCN-NEXT: [[VEC231:%.*]] = shufflevector <16 x i16> [[TMP50]], <16 x i16> [[TMP51]], <16 x i32> -; GCN-NEXT: [[TMP52:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP52:%.*]] = shufflevector <2 x i16> [[TMP10]], <2 x i16> poison, <16 x i32> ; GCN-NEXT: [[VEC252:%.*]] = shufflevector <16 x i16> [[VEC231]], <16 x i16> [[TMP52]], <16 x i32> -; GCN-NEXT: [[TMP53:%.*]] = shufflevector <2 x i16> [[TMP3]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP53:%.*]] = shufflevector <2 x i16> [[TMP11]], <2 x i16> poison, <16 x i32> ; GCN-NEXT: [[VEC273:%.*]] = shufflevector <16 x i16> [[VEC252]], <16 x i16> [[TMP53]], <16 x i32> -; GCN-NEXT: [[TMP54:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP54:%.*]] = shufflevector <2 x i16> [[TMP12]], <2 x i16> poison, <16 x i32> ; GCN-NEXT: [[VEC294:%.*]] = shufflevector <16 x i16> [[VEC273]], <16 x i16> [[TMP54]], <16 x i32> -; GCN-NEXT: [[TMP55:%.*]] = shufflevector <2 x i16> [[TMP5]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP55:%.*]] = shufflevector <2 x i16> [[TMP13]], <2 x i16> poison, <16 x i32> ; GCN-NEXT: [[VEC2115:%.*]] = shufflevector <16 x i16> [[VEC294]], <16 x i16> [[TMP55]], <16 x i32> -; GCN-NEXT: [[TMP56:%.*]] = shufflevector <2 x i16> [[TMP6]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP56:%.*]] = shufflevector <2 x i16> [[TMP14]], <2 x i16> poison, <16 x i32> ; GCN-NEXT: [[VEC2136:%.*]] = shufflevector <16 x i16> [[VEC2115]], <16 x i16> [[TMP56]], <16 x i32> -; GCN-NEXT: [[TMP59:%.*]] = shufflevector <2 x i16> [[TMP7]], <2 x i16> poison, <16 x i32> +; GCN-NEXT: [[TMP59:%.*]] = shufflevector <2 x i16> [[TMP42]], <2 x i16> poison, <16 x i32> ; GCN-NEXT: [[VEC2151:%.*]] = shufflevector <16 x i16> [[VEC2136]], <16 x i16> [[TMP59]], <16 x i32> -; GCN-NEXT: [[TMP57:%.*]] = shufflevector <2 x i16> [[TMP41]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[VEC22:%.*]] = insertelement <16 x i16> [[TMP57]], i16 [[PHI2]], i64 2 -; GCN-NEXT: [[VEC23:%.*]] = insertelement <16 x i16> [[VEC22]], i16 [[PHI3]], i64 3 -; GCN-NEXT: [[VEC24:%.*]] = insertelement <16 x i16> [[VEC23]], i16 [[PHI4]], i64 4 -; GCN-NEXT: [[VEC25:%.*]] = insertelement <16 x i16> [[VEC24]], i16 [[PHI5]], i64 5 -; GCN-NEXT: [[VEC26:%.*]] = insertelement <16 x i16> [[VEC25]], i16 [[PHI6]], i64 6 -; GCN-NEXT: [[VEC27:%.*]] = insertelement <16 x i16> [[VEC26]], i16 [[PHI7]], i64 7 -; GCN-NEXT: [[VEC28:%.*]] = insertelement <16 x i16> [[VEC27]], i16 [[PHI8]], i64 8 -; GCN-NEXT: [[VEC29:%.*]] = insertelement <16 x i16> [[VEC28]], i16 [[PHI9]], i64 9 -; GCN-NEXT: [[VEC210:%.*]] = insertelement <16 x i16> [[VEC29]], i16 [[PHI10]], i64 10 -; GCN-NEXT: [[VEC211:%.*]] = insertelement <16 x i16> [[VEC210]], i16 [[PHI11]], i64 11 -; GCN-NEXT: [[VEC212:%.*]] = insertelement <16 x i16> [[VEC211]], i16 [[PHI12]], i64 12 -; GCN-NEXT: [[VEC213:%.*]] = insertelement <16 x i16> [[VEC212]], i16 [[PHI13]], i64 13 -; GCN-NEXT: [[TMP61:%.*]] = shufflevector <2 x i16> [[TMP42]], <2 x i16> poison, <16 x i32> -; GCN-NEXT: [[VEC2152:%.*]] = shufflevector <16 x i16> [[VEC213]], <16 x i16> [[TMP61]], <16 x i32> -; GCN-NEXT: store <16 x i16> [[VEC2151]], ptr [[OUT]], align 32 -; GCN-NEXT: store <16 x i16> [[VEC2157]], ptr [[OUT1]], align 32 -; GCN-NEXT: store <16 x i16> [[VEC2152]], ptr [[OUT2]], align 32 +; GCN-NEXT: store <16 x i16> [[VEC2157]], ptr [[OUT]], align 32 +; GCN-NEXT: store <16 x i16> [[TMP49]], ptr [[OUT1]], align 32 +; GCN-NEXT: store <16 x i16> [[VEC2151]], ptr [[OUT2]], align 32 ; GCN-NEXT: ret void ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll index f8522bc546e6b..e4daba253b439 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll @@ -45,12 +45,13 @@ declare float @llvm.fmuladd.f32(float, float, float) define void @test(float %a) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 -; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> zeroinitializer, [[SHUFFLE]] -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[ADD_I157:%.*]] = fadd float 0.000000e+00, [[A:%.*]] +; CHECK-NEXT: [[ADD23_I:%.*]] = fadd float 0.000000e+00, [[A]] +; CHECK-NEXT: [[INSERT:%.*]] = insertelement <2 x float> zeroinitializer, float [[ADD_I157]], i64 0 +; CHECK-NEXT: [[INSERT_I:%.*]] = insertelement <2 x float> [[INSERT]], float [[ADD23_I]], i64 1 +; CHECK-NEXT: [[AGG:%.*]] = insertelement <2 x float> [[INSERT_I]], float [[ADD_I157]], i64 1 ; CHECK-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll index 28bab3276c47d..6942df532ae29 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extractelement-single-use-many-nodes.ll @@ -7,9 +7,8 @@ define void @foo(double %i) { ; CHECK-NEXT: bb: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x double> , double [[I]], i32 2 ; CHECK-NEXT: [[TMP1:%.*]] = fsub <4 x double> zeroinitializer, [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[I]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = fsub <2 x double> zeroinitializer, [[TMP3]] ; CHECK-NEXT: [[I82:%.*]] = fsub double 0.000000e+00, poison +; CHECK-NEXT: [[I103:%.*]] = fsub double 0.000000e+00, [[I]] ; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <8 x i32> ; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x double> [[TMP8]], <8 x double> , <8 x i32> ; CHECK-NEXT: [[TMP6:%.*]] = insertelement <8 x double> [[TMP5]], double [[I82]], i32 2 @@ -22,13 +21,11 @@ define void @foo(double %i) { ; CHECK-NEXT: [[TMP17:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP16]]) ; CHECK-NEXT: br i1 [[TMP17]], label [[BB58:%.*]], label [[BB115:%.*]] ; CHECK: bb115: -; CHECK-NEXT: [[TMP18:%.*]] = fmul <2 x double> zeroinitializer, [[TMP4]] -; CHECK-NEXT: [[TMP19:%.*]] = extractelement <2 x double> [[TMP18]], i32 0 -; CHECK-NEXT: [[TMP20:%.*]] = extractelement <2 x double> [[TMP18]], i32 1 +; CHECK-NEXT: [[TMP19:%.*]] = fmul double 0.000000e+00, [[I103]] +; CHECK-NEXT: [[TMP20:%.*]] = fmul double 0.000000e+00, [[I82]] ; CHECK-NEXT: [[I118:%.*]] = fadd double [[TMP19]], [[TMP20]] ; CHECK-NEXT: [[TMP21:%.*]] = fmul <4 x double> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP22:%.*]] = shufflevector <2 x double> [[TMP4]], <2 x double> poison, <4 x i32> -; CHECK-NEXT: [[TMP23:%.*]] = shufflevector <4 x double> , <4 x double> [[TMP22]], <4 x i32> +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x double> , double [[I82]], i32 3 ; CHECK-NEXT: [[TMP24:%.*]] = fadd <4 x double> [[TMP21]], [[TMP23]] ; CHECK-NEXT: [[TMP25:%.*]] = fadd <4 x double> [[TMP24]], zeroinitializer ; CHECK-NEXT: [[TMP26:%.*]] = select <4 x i1> zeroinitializer, <4 x double> zeroinitializer, <4 x double> [[TMP25]] diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insertelements-with-reused-indices.ll b/llvm/test/Transforms/SLPVectorizer/X86/insertelements-with-reused-indices.ll index a8160d77619c5..3d45ebdc38968 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/insertelements-with-reused-indices.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insertelements-with-reused-indices.ll @@ -5,11 +5,11 @@ define void @test() { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> undef, float 0.000000e+00, i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float 0.000000e+00, i32 1 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float 0.000000e+00, i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float 0.000000e+00, i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = fsub float 0.000000e+00, 0.000000e+00 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP4]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP0]], float 0.000000e+00, i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP2]], float 0.000000e+00, i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP5]], float 0.000000e+00, i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = fsub float 0.000000e+00, 0.000000e+00 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP8]], i32 0 ; CHECK-NEXT: ret void ; entry: From c9241b02804133e31c4e83630646840bb2a18263 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Mon, 28 Apr 2025 16:25:57 -0700 Subject: [PATCH 2/3] Use correct mask Change-Id: Ie1209bb7f6c49992d41e3fec8a195a81d04f34d4 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 2 +- .../SLPVectorizer/X86/buildvector-shuffle.ll | 9 +++---- .../X86/vec_list_bias-inseltpoison.ll | 25 ++++++++++--------- 3 files changed, 18 insertions(+), 18 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index fd58da64901f3..4f4e708bd90ad 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -14916,7 +14916,7 @@ InstructionCost BoUpSLP::getTreeCost(ArrayRef VectorizedVals, if (Mask[I] != PoisonMaskElem) ResizeMask[Mask[I]] = Mask[I]; } - if (!ShuffleVectorInst::isIdentityMask(Mask, VF)) + if (!ShuffleVectorInst::isIdentityMask(ResizeMask, VF)) C = ::getShuffleCost( *TTI, TTI::SK_PermuteSingleSrc, getWidenedType(TE->getMainOp()->getType(), VecVF), ResizeMask); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll index e4daba253b439..40d7dde56e90b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll @@ -45,13 +45,12 @@ declare float @llvm.fmuladd.f32(float, float, float) define void @test(float %a) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[ADD_I157:%.*]] = fadd float 0.000000e+00, [[A:%.*]] -; CHECK-NEXT: [[ADD23_I:%.*]] = fadd float 0.000000e+00, [[A]] -; CHECK-NEXT: [[INSERT:%.*]] = insertelement <2 x float> zeroinitializer, float [[ADD_I157]], i64 0 -; CHECK-NEXT: [[INSERT_I:%.*]] = insertelement <2 x float> [[INSERT]], float [[ADD23_I]], i64 1 -; CHECK-NEXT: [[AGG:%.*]] = insertelement <2 x float> [[INSERT_I]], float [[ADD_I157]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> zeroinitializer, [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll index e3a6020a542fb..2cc2f28ccf6d5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/vec_list_bias-inseltpoison.ll @@ -25,7 +25,6 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T24:%.*]] = add nsw i32 [[T23]], [[T21]] ; CHECK-NEXT: [[T25:%.*]] = sub nsw i32 [[T21]], [[T23]] ; CHECK-NEXT: [[T27:%.*]] = sub nsw i32 [[T3]], [[T24]] -; CHECK-NEXT: [[T32:%.*]] = mul nsw i32 [[T27]], 6270 ; CHECK-NEXT: [[T37:%.*]] = add nsw i32 [[T25]], [[T11]] ; CHECK-NEXT: [[T38:%.*]] = add nsw i32 [[T17]], [[T5]] ; CHECK-NEXT: [[T39:%.*]] = add nsw i32 [[T37]], [[T38]] @@ -34,7 +33,6 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T42:%.*]] = mul nsw i32 [[T17]], 16819 ; CHECK-NEXT: [[T47:%.*]] = mul nsw i32 [[T37]], -16069 ; CHECK-NEXT: [[T48:%.*]] = mul nsw i32 [[T38]], -3196 -; CHECK-NEXT: [[T49:%.*]] = add nsw i32 [[T40]], [[T47]] ; CHECK-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[T8]], align 4 ; CHECK-NEXT: [[T15:%.*]] = load i32, ptr [[T14]], align 4 ; CHECK-NEXT: [[T9:%.*]] = load i32, ptr [[T8]], align 4 @@ -42,17 +40,20 @@ define void @test(ptr nocapture %t2) { ; CHECK-NEXT: [[T30:%.*]] = add nsw i32 [[T27]], [[T29]] ; CHECK-NEXT: [[T31:%.*]] = mul nsw i32 [[T30]], 4433 ; CHECK-NEXT: [[T34:%.*]] = mul nsw i32 [[T29]], -15137 -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x i32> [[TMP2]], i32 [[T40]], i32 1 -; CHECK-NEXT: [[TMP4:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[T48]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = add nsw <2 x i32> [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP5]], <2 x i32> poison, <8 x i32> -; CHECK-NEXT: [[T67:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[T32]], i32 2 -; CHECK-NEXT: [[T68:%.*]] = insertelement <8 x i32> [[T67]], i32 [[T49]], i32 3 -; CHECK-NEXT: [[T701:%.*]] = shufflevector <8 x i32> [[T68]], <8 x i32> [[TMP6]], <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[T40]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[T27]], i32 2 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[T47]], i32 3 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> , <4 x i32> [[TMP6]], <4 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[T48]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[T40]], i32 3 +; CHECK-NEXT: [[TMP10:%.*]] = add nsw <4 x i32> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <4 x i32> [[TMP5]], [[TMP9]] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <4 x i32> [[TMP10]], <4 x i32> [[TMP11]], <4 x i32> +; CHECK-NEXT: [[T701:%.*]] = shufflevector <4 x i32> [[TMP12]], <4 x i32> poison, <8 x i32> ; CHECK-NEXT: [[T71:%.*]] = insertelement <8 x i32> [[T701]], i32 [[T34]], i32 6 -; CHECK-NEXT: [[T72:%.*]] = insertelement <8 x i32> [[T71]], i32 [[T49]], i32 7 -; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T72]], splat (i32 3) +; CHECK-NEXT: [[T76:%.*]] = shl <8 x i32> [[T71]], splat (i32 3) ; CHECK-NEXT: store <8 x i32> [[T76]], ptr [[T2]], align 4 ; CHECK-NEXT: ret void ; From 2fd89792810a9cfee1bcd7b3e679fec5cdb60226 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 11 Jun 2025 07:24:26 -0700 Subject: [PATCH 3/3] Revert unchanged tests Change-Id: I9c4865641974152f6289df79df4aa057cdcdb8ed --- .../SLPVectorizer/X86/buildvector-shuffle.ll | 6 +++--- .../X86/insertelements-with-reused-indices.ll | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll index 40d7dde56e90b..f8522bc546e6b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/buildvector-shuffle.ll @@ -46,11 +46,11 @@ define void @test(float %a) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x float> poison, float [[A:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <2 x float> [[TMP0]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: -; CHECK-NEXT: [[TMP2:%.*]] = fadd <2 x float> zeroinitializer, [[TMP1]] -; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <2 x float> [[TMP2]], <2 x float> poison, <2 x i32> zeroinitializer +; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x float> zeroinitializer, [[SHUFFLE]] +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: br label [[LOOP]] ; entry: diff --git a/llvm/test/Transforms/SLPVectorizer/X86/insertelements-with-reused-indices.ll b/llvm/test/Transforms/SLPVectorizer/X86/insertelements-with-reused-indices.ll index 3d45ebdc38968..a8160d77619c5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/insertelements-with-reused-indices.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/insertelements-with-reused-indices.ll @@ -5,11 +5,11 @@ define void @test() { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x float> undef, float 0.000000e+00, i32 0 -; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP0]], float 0.000000e+00, i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP2]], float 0.000000e+00, i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP5]], float 0.000000e+00, i32 0 -; CHECK-NEXT: [[TMP8:%.*]] = fsub float 0.000000e+00, 0.000000e+00 -; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP8]], i32 0 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x float> [[TMP0]], float 0.000000e+00, i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x float> [[TMP1]], float 0.000000e+00, i32 0 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x float> [[TMP2]], float 0.000000e+00, i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = fsub float 0.000000e+00, 0.000000e+00 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x float> zeroinitializer, float [[TMP4]], i32 0 ; CHECK-NEXT: ret void ; entry: