diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 332c52040e21c..cd251e528669e 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -106,6 +106,7 @@ class VectorCombine {
                        Instruction &I);
   bool foldExtractExtract(Instruction &I);
   bool foldInsExtFNeg(Instruction &I);
+  bool foldInsExtVectorToShuffle(Instruction &I);
   bool foldBitcastShuffle(Instruction &I);
   bool scalarizeBinopOrCmp(Instruction &I);
   bool scalarizeVPIntrinsic(Instruction &I);
@@ -2780,6 +2781,51 @@ bool VectorCombine::shrinkType(llvm::Instruction &I) {
   return true;
 }
 
+/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
+///   shuffle (DstVec, SrcVec, Mask)
+/// Requires constant in-range indices and one shared fixed vector type; only
+/// fires when TTI does not cost the shuffle higher. Returns true on a fold.
+bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
+  Value *DstVec, *SrcVec;
+  uint64_t ExtIdx, InsIdx;
+  if (!match(&I,
+             m_InsertElt(m_Value(DstVec),
+                         m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
+                         m_ConstantInt(InsIdx))))
+    return false;
+
+  // A fixed element count and matching operand types are needed for Mask.
+  auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
+  if (!VecTy || SrcVec->getType() != VecTy)
+    return false;
+  unsigned NumElts = VecTy->getNumElements();
+  if (ExtIdx >= NumElts || InsIdx >= NumElts)
+    return false;
+
+  // Identity on DstVec; lane InsIdx reads SrcVec[ExtIdx] (offset NumElts).
+  SmallVector<int> Mask(NumElts, 0);
+  std::iota(Mask.begin(), Mask.end(), 0);
+  Mask[InsIdx] = ExtIdx + NumElts;
+
+  auto *Ins = cast<InsertElementInst>(&I);
+  auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost OldCost =
+      TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx) +
+      TTI.getVectorInstrCost(*Ins, VecTy, CostKind, InsIdx);
+  InstructionCost NewCost = TTI.getShuffleCost(
+      TargetTransformInfo::SK_PermuteTwoSrc, VecTy, Mask, CostKind);
+  // A multi-use extract is not removed, so its cost is still paid.
+  if (!Ext->hasOneUse())
+    NewCost += TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx);
+  if (OldCost < NewCost)
+    return false;
+
+  Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
+  replaceValue(I, *Shuf);
+  return true;
+}
+
 /// This is the
entry point for all transforms. Pass manager differences are /// handled in the callers of this function. bool VectorCombine::run() { @@ -2836,6 +2882,7 @@ bool VectorCombine::run() { switch (Opcode) { case Instruction::InsertElement: MadeChange |= foldInsExtFNeg(I); + MadeChange |= foldInsExtVectorToShuffle(I); break; case Instruction::ShuffleVector: MadeChange |= foldPermuteOfBinops(I); diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll index 3d69f15fc5f24..bbf0db677461e 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll @@ -420,8 +420,7 @@ define <4 x float> @ins_bo_ext_ext(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @ins_bo_ext_ext( ; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]] -; CHECK-NEXT: [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i64 3 -; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3 +; CHECK-NEXT: [[V3:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[TMP1]], <4 x i32> ; CHECK-NEXT: ret <4 x float> [[V3]] ; %a2 = extractelement <4 x float> %a, i32 2 @@ -435,13 +434,21 @@ define <4 x float> @ins_bo_ext_ext(<4 x float> %a, <4 x float> %b) { ; but it is likely that extracting from index 3 is the better option. 
define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) { -; CHECK-LABEL: @ins_bo_ext_ext_uses( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] -; CHECK-NEXT: [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: call void @use_f32(float [[A23]]) -; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3 -; CHECK-NEXT: ret <4 x float> [[V3]] +; SSE-LABEL: @ins_bo_ext_ext_uses( +; SSE-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] +; SSE-NEXT: [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE-NEXT: call void @use_f32(float [[A23]]) +; SSE-NEXT: [[V3:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[TMP1]], <4 x i32> +; SSE-NEXT: ret <4 x float> [[V3]] +; +; AVX-LABEL: @ins_bo_ext_ext_uses( +; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] +; AVX-NEXT: [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; AVX-NEXT: call void @use_f32(float [[A23]]) +; AVX-NEXT: [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3 +; AVX-NEXT: ret <4 x float> [[V3]] ; %a2 = extractelement <4 x float> %a, i32 2 %a3 = extractelement <4 x float> %a, i32 3 @@ -452,22 +459,34 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) { } define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) { -; CHECK-LABEL: @PR34724( -; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] -; CHECK-NEXT: [[A23:%.*]] = 
extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]] -; CHECK-NEXT: [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]] -; CHECK-NEXT: [[B23:%.*]] = extractelement <4 x float> [[TMP3]], i64 3 -; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x float> poison, float [[A23]], i32 1 -; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2 -; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[B23]], i32 3 -; CHECK-NEXT: ret <4 x float> [[V3]] +; SSE-LABEL: @PR34724( +; SSE-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 +; SSE-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; SSE-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] +; SSE-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]] +; SSE-NEXT: [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]] +; SSE-NEXT: [[V1:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP1]], <4 x i32> +; SSE-NEXT: [[V2:%.*]] = shufflevector <4 x float> [[V1]], <4 x float> [[TMP2]], <4 x i32> +; SSE-NEXT: [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> +; SSE-NEXT: ret <4 x float> [[V3]] +; +; AVX-LABEL: @PR34724( +; AVX-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 +; AVX-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> +; 
AVX-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] +; AVX-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]] +; AVX-NEXT: [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; AVX-NEXT: [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]] +; AVX-NEXT: [[V1:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP1]], <4 x i32> +; AVX-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2 +; AVX-NEXT: [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> +; AVX-NEXT: ret <4 x float> [[V3]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll index 52f7cd859a1ab..284d2859304eb 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll @@ -420,8 +420,7 @@ define <4 x float> @ins_bo_ext_ext(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @ins_bo_ext_ext( ; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]] -; CHECK-NEXT: [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i64 3 -; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3 +; CHECK-NEXT: [[V3:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[TMP1]], <4 x i32> ; CHECK-NEXT: ret <4 x float> [[V3]] ; %a2 = extractelement <4 x float> %a, i32 2 @@ -435,13 +434,21 @@ define <4 x float> @ins_bo_ext_ext(<4 x float> %a, <4 x float> %b) { ; but it is likely that extracting from index 3 is the better option. 
define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) { -; CHECK-LABEL: @ins_bo_ext_ext_uses( -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] -; CHECK-NEXT: [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: call void @use_f32(float [[A23]]) -; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3 -; CHECK-NEXT: ret <4 x float> [[V3]] +; SSE-LABEL: @ins_bo_ext_ext_uses( +; SSE-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] +; SSE-NEXT: [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SSE-NEXT: call void @use_f32(float [[A23]]) +; SSE-NEXT: [[V3:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[TMP1]], <4 x i32> +; SSE-NEXT: ret <4 x float> [[V3]] +; +; AVX-LABEL: @ins_bo_ext_ext_uses( +; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] +; AVX-NEXT: [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; AVX-NEXT: call void @use_f32(float [[A23]]) +; AVX-NEXT: [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3 +; AVX-NEXT: ret <4 x float> [[V3]] ; %a2 = extractelement <4 x float> %a, i32 2 %a3 = extractelement <4 x float> %a, i32 3 @@ -452,22 +459,34 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) { } define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) { -; CHECK-LABEL: @PR34724( -; CHECK-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 -; CHECK-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 -; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] -; CHECK-NEXT: [[A23:%.*]] = 
extractelement <4 x float> [[TMP1]], i32 2 -; CHECK-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]] -; CHECK-NEXT: [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]] -; CHECK-NEXT: [[B23:%.*]] = extractelement <4 x float> [[TMP3]], i64 3 -; CHECK-NEXT: [[V1:%.*]] = insertelement <4 x float> undef, float [[A23]], i32 1 -; CHECK-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2 -; CHECK-NEXT: [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[B23]], i32 3 -; CHECK-NEXT: ret <4 x float> [[V3]] +; SSE-LABEL: @PR34724( +; SSE-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 +; SSE-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; SSE-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] +; SSE-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]] +; SSE-NEXT: [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> +; SSE-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]] +; SSE-NEXT: [[V1:%.*]] = shufflevector <4 x float> undef, <4 x float> [[TMP1]], <4 x i32> +; SSE-NEXT: [[V2:%.*]] = shufflevector <4 x float> [[V1]], <4 x float> [[TMP2]], <4 x i32> +; SSE-NEXT: [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> +; SSE-NEXT: ret <4 x float> [[V3]] +; +; AVX-LABEL: @PR34724( +; AVX-NEXT: [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0 +; AVX-NEXT: [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1 +; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> +; 
AVX-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]] +; AVX-NEXT: [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]] +; AVX-NEXT: [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; AVX-NEXT: [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]] +; AVX-NEXT: [[V1:%.*]] = shufflevector <4 x float> undef, <4 x float> [[TMP1]], <4 x i32> +; AVX-NEXT: [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2 +; AVX-NEXT: [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> +; AVX-NEXT: ret <4 x float> [[V3]] ; %a0 = extractelement <4 x float> %a, i32 0 %a1 = extractelement <4 x float> %a, i32 1 diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll index 64c86a741b177..3dae93665b1ed 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll @@ -163,8 +163,7 @@ define <4 x i1> @ins_fcmp_ext_ext(<4 x float> %a, <4 x i1> %b) { ; AVX-LABEL: @ins_fcmp_ext_ext( ; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> ; AVX-NEXT: [[TMP1:%.*]] = fcmp ugt <4 x float> [[A]], [[SHIFT]] -; AVX-NEXT: [[A21:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2 -; AVX-NEXT: [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[A21]], i32 2 +; AVX-NEXT: [[R:%.*]] = shufflevector <4 x i1> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> ; AVX-NEXT: ret <4 x i1> [[R]] ; %a1 = extractelement <4 x float> %a, i32 1 diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll index c4aba63568e2f..937d4043adc0c 100644 --- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll +++ 
b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll @@ -546,8 +546,7 @@ define void @PR47558_multiple_use_load(ptr nocapture nonnull %resultptr, ptr noc ; CHECK-NEXT: [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]] ; CHECK-NEXT: [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0 ; CHECK-NEXT: [[RESULT0:%.*]] = insertelement <2 x float> poison, float [[T4]], i32 0 -; CHECK-NEXT: [[T5:%.*]] = extractelement <2 x float> [[T3]], i32 1 -; CHECK-NEXT: [[RESULT1:%.*]] = insertelement <2 x float> [[RESULT0]], float [[T5]], i32 1 +; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <2 x float> [[RESULT0]], <2 x float> [[T3]], <2 x i32> ; CHECK-NEXT: store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll index d5c19b35838d7..bdd05a1a37c70 100644 --- a/llvm/test/Transforms/VectorCombine/X86/load.ll +++ b/llvm/test/Transforms/VectorCombine/X86/load.ll @@ -529,8 +529,7 @@ define void @PR47558_multiple_use_load(ptr nocapture nonnull %resultptr, ptr noc ; CHECK-NEXT: [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]] ; CHECK-NEXT: [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0 ; CHECK-NEXT: [[RESULT0:%.*]] = insertelement <2 x float> undef, float [[T4]], i32 0 -; CHECK-NEXT: [[T5:%.*]] = extractelement <2 x float> [[T3]], i32 1 -; CHECK-NEXT: [[RESULT1:%.*]] = insertelement <2 x float> [[RESULT0]], float [[T5]], i32 1 +; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <2 x float> [[RESULT0]], <2 x float> [[T3]], <2 x i32> ; CHECK-NEXT: store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8 ; CHECK-NEXT: ret void ;