diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp index 8bced3d4ca16d..b35f6d71f3945 100644 --- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp +++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp @@ -450,6 +450,8 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, // TODO: Evaluate whether that always results in lowest cost. Alternatively, // check the cost of creating a broadcast shuffle and shuffling both // operands to element 0. + unsigned BestExtIndex = Extract0Cost > Extract1Cost ? Ext0Index : Ext1Index; + unsigned BestInsIndex = Extract0Cost > Extract1Cost ? Ext1Index : Ext0Index; InstructionCost CheapExtractCost = std::min(Extract0Cost, Extract1Cost); // Extra uses of the extracts mean that we include those costs in the @@ -485,8 +487,18 @@ bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0, // ShufMask = { poison, poison, 0, poison } // TODO: The cost model has an option for a "broadcast" shuffle // (splat-from-element-0), but no option for a more general splat. - NewCost += - TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy); + if (auto *FixedVecTy = dyn_cast(VecTy)) { + SmallVector ShuffleMask(FixedVecTy->getNumElements(), + PoisonMaskElem); + ShuffleMask[BestInsIndex] = BestExtIndex; + NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, + VecTy, ShuffleMask, CostKind, 0, nullptr, + {ConvertToShuffle}); + } else { + NewCost += + TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy, + {}, CostKind, 0, nullptr, {ConvertToShuffle}); + } } // Aggressively form a vector op if the cost is equal because the transform diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll index d0568f3b961fd..4e1051d1991aa 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -O3 -S < %s | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -O3 -S < %s | FileCheck %s --check-prefixes=SSE,SSE4 -; RUN: opt -mtriple=x86_64-- -mcpu=btver2 -O3 -S < %s | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -O3 -S < %s | FileCheck %s --check-prefixes=AVX,AVX2 -; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes="default" -S < %s | FileCheck %s --check-prefixes=SSE,SSE2 -; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes="default" -S < %s | FileCheck %s --check-prefixes=SSE,SSE4 -; RUN: opt -mtriple=x86_64-- -mcpu=btver2 -passes="default" -S < %s | FileCheck %s --check-prefixes=AVX,AVX1 -; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -passes="default" -S < %s | FileCheck %s --check-prefixes=AVX,AVX2 +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -O3 -S < %s | FileCheck %s --check-prefixes=SSE2 +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -O3 -S < %s | FileCheck %s --check-prefixes=SSE4 +; RUN: opt -mtriple=x86_64-- -mcpu=btver2 -O3 -S < %s | FileCheck %s --check-prefixes=AVX +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -O3 -S < %s | FileCheck %s --check-prefixes=AVX +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64 -passes="default" -S < %s | FileCheck %s --check-prefixes=SSE2 +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v2 -passes="default" -S < %s | FileCheck %s --check-prefixes=SSE4 +; RUN: opt -mtriple=x86_64-- -mcpu=btver2 -passes="default" -S < %s | FileCheck %s --check-prefixes=AVX +; RUN: opt -mtriple=x86_64-- -mcpu=x86-64-v3 -passes="default" -S < %s | FileCheck %s --check-prefixes=AVX define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) { ; SSE2-LABEL: @PR50392( @@ -30,24 +30,14 @@ define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) { ; SSE4-NEXT: [[SHUFFLE:%.*]] = insertelement <4 x double> [[TMP3]], double [[ADD12]], i64 3 ; SSE4-NEXT: ret <4 x double> [[SHUFFLE]] ; -; AVX1-LABEL: @PR50392( -; AVX1-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; AVX1-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; AVX1-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] -; AVX1-NEXT: [[VECEXT10:%.*]] = extractelement <4 x double> [[B]], i64 2 -; AVX1-NEXT: [[VECEXT11:%.*]] = extractelement <4 x double> [[B]], i64 3 -; AVX1-NEXT: [[ADD12:%.*]] = fadd double [[VECEXT10]], [[VECEXT11]] -; AVX1-NEXT: [[SHUFFLE:%.*]] = insertelement <4 x double> [[TMP4]], double [[ADD12]], i64 3 -; AVX1-NEXT: ret <4 x double> [[SHUFFLE]] -; -; AVX2-LABEL: @PR50392( -; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> -; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> -; AVX2-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] -; AVX2-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> -; AVX2-NEXT: [[TMP5:%.*]] = fadd <4 x double> [[B]], [[SHIFT]] -; AVX2-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP5]], <4 x i32> -; AVX2-NEXT: ret <4 x double> [[SHUFFLE]] +; AVX-LABEL: @PR50392( +; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> +; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> +; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]] +; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[B]], <4 x double> poison, <4 x i32> +; AVX-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[B]], [[SHIFT]] +; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x double> [[TMP3]], <4 x double> [[TMP4]], <4 x i32> +; AVX-NEXT: ret <4 x double> [[SHUFFLE]] ; %vecext = extractelement <4 x double> %a, i32 0 %vecext1 = extractelement <4 x double> %a, i32 1 @@ -68,6 +58,3 @@ define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) { %shuffle = shufflevector <4 x double> %vecinit13, <4 x double> %a, <4 x i32> ret <4 x double> %shuffle } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; AVX: {{.*}} -; SSE: {{.*}} diff --git a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll index 613c71cb769b0..551d6d1cabd41 100644 --- a/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll +++ b/llvm/test/Transforms/VectorCombine/AArch64/load-extractelement-scalarization.ll @@ -541,18 +541,18 @@ define i32 @load_extract_clobber_store_between(ptr %x, ptr %y) { define i32 @load_extract_clobber_store_between_limit(ptr %x, ptr %y, <8 x i32> %z) { ; CHECK-LABEL: @load_extract_clobber_store_between_limit( ; CHECK-NEXT: [[LV:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 16 -; CHECK-NEXT: [[Z_0:%.*]] = extractelement <8 x i32> [[Z:%.*]], i32 0 -; CHECK-NEXT: [[Z_1:%.*]] = extractelement <8 x i32> [[Z]], i32 1 -; CHECK-NEXT: [[ADD_0:%.*]] = add i32 [[Z_0]], [[Z_1]] -; CHECK-NEXT: [[Z_2:%.*]] = extractelement <8 x i32> [[Z]], i32 2 -; CHECK-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[Z_2]] -; CHECK-NEXT: [[Z_3:%.*]] = extractelement <8 x i32> [[Z]], i32 3 -; CHECK-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_1]], [[Z_3]] -; CHECK-NEXT: [[Z_4:%.*]] = extractelement <8 x i32> [[Z]], i32 4 -; CHECK-NEXT: [[ADD_3:%.*]] = add i32 [[ADD_2]], [[Z_4]] +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i32> [[Z1:%.*]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[Z1]], [[SHIFT]] +; CHECK-NEXT: [[SHIFT1:%.*]] = shufflevector <8 x i32> [[Z1]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = add <8 x i32> [[TMP1]], [[SHIFT1]] +; CHECK-NEXT: [[SHIFT2:%.*]] = shufflevector <8 x i32> [[Z1]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[SHIFT2]] +; CHECK-NEXT: [[SHIFT3:%.*]] = shufflevector <8 x i32> [[Z1]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[Z:%.*]] = add <8 x i32> [[TMP3]], [[SHIFT3]] +; CHECK-NEXT: [[Z_0:%.*]] = extractelement <8 x i32> [[Z]], i32 0 ; CHECK-NEXT: store i8 0, ptr [[Y:%.*]], align 1 ; CHECK-NEXT: [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 2 -; CHECK-NEXT: [[ADD_4:%.*]] = add i32 [[ADD_3]], [[R]] +; CHECK-NEXT: [[ADD_4:%.*]] = add i32 [[Z_0]], [[R]] ; CHECK-NEXT: ret i32 [[ADD_4]] ; %lv = load <4 x i32>, ptr %x @@ -573,35 +573,35 @@ define i32 @load_extract_clobber_store_between_limit(ptr %x, ptr %y, <8 x i32> % define i32 @load_extract_clobber_store_after_limit(ptr %x, ptr %y, <8 x i32> %z) { ; LIMIT-DEFAULT-LABEL: @load_extract_clobber_store_after_limit( -; LIMIT-DEFAULT-NEXT: [[Z_0:%.*]] = extractelement <8 x i32> [[Z:%.*]], i32 0 -; LIMIT-DEFAULT-NEXT: [[Z_1:%.*]] = extractelement <8 x i32> [[Z]], i32 1 -; LIMIT-DEFAULT-NEXT: [[ADD_0:%.*]] = add i32 [[Z_0]], [[Z_1]] -; LIMIT-DEFAULT-NEXT: [[Z_2:%.*]] = extractelement <8 x i32> [[Z]], i32 2 -; LIMIT-DEFAULT-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[Z_2]] -; LIMIT-DEFAULT-NEXT: [[Z_3:%.*]] = extractelement <8 x i32> [[Z]], i32 3 -; LIMIT-DEFAULT-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_1]], [[Z_3]] -; LIMIT-DEFAULT-NEXT: [[Z_4:%.*]] = extractelement <8 x i32> [[Z]], i32 4 -; LIMIT-DEFAULT-NEXT: [[ADD_3:%.*]] = add i32 [[ADD_2]], [[Z_4]] +; LIMIT-DEFAULT-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i32> [[Z1:%.*]], <8 x i32> poison, <8 x i32> +; LIMIT-DEFAULT-NEXT: [[TMP4:%.*]] = add <8 x i32> [[Z1]], [[SHIFT]] +; LIMIT-DEFAULT-NEXT: [[SHIFT1:%.*]] = shufflevector <8 x i32> [[Z1]], <8 x i32> poison, <8 x i32> +; LIMIT-DEFAULT-NEXT: [[TMP2:%.*]] = add <8 x i32> [[TMP4]], [[SHIFT1]] +; LIMIT-DEFAULT-NEXT: [[SHIFT2:%.*]] = shufflevector <8 x i32> [[Z1]], <8 x i32> poison, <8 x i32> +; LIMIT-DEFAULT-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[SHIFT2]] +; LIMIT-DEFAULT-NEXT: [[SHIFT3:%.*]] = shufflevector <8 x i32> [[Z1]], <8 x i32> poison, <8 x i32> +; LIMIT-DEFAULT-NEXT: [[Z:%.*]] = add <8 x i32> [[TMP3]], [[SHIFT3]] +; LIMIT-DEFAULT-NEXT: [[Z_0:%.*]] = extractelement <8 x i32> [[Z]], i32 0 ; LIMIT-DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr [[X:%.*]], i32 0, i32 2 ; LIMIT-DEFAULT-NEXT: [[R:%.*]] = load i32, ptr [[TMP1]], align 8 ; LIMIT-DEFAULT-NEXT: store i8 0, ptr [[Y:%.*]], align 1 -; LIMIT-DEFAULT-NEXT: [[ADD_4:%.*]] = add i32 [[ADD_3]], [[R]] +; LIMIT-DEFAULT-NEXT: [[ADD_4:%.*]] = add i32 [[Z_0]], [[R]] ; LIMIT-DEFAULT-NEXT: ret i32 [[ADD_4]] ; ; LIMIT2-LABEL: @load_extract_clobber_store_after_limit( ; LIMIT2-NEXT: [[LV:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 16 -; LIMIT2-NEXT: [[Z_0:%.*]] = extractelement <8 x i32> [[Z:%.*]], i32 0 -; LIMIT2-NEXT: [[Z_1:%.*]] = extractelement <8 x i32> [[Z]], i32 1 -; LIMIT2-NEXT: [[ADD_0:%.*]] = add i32 [[Z_0]], [[Z_1]] -; LIMIT2-NEXT: [[Z_2:%.*]] = extractelement <8 x i32> [[Z]], i32 2 -; LIMIT2-NEXT: [[ADD_1:%.*]] = add i32 [[ADD_0]], [[Z_2]] -; LIMIT2-NEXT: [[Z_3:%.*]] = extractelement <8 x i32> [[Z]], i32 3 -; LIMIT2-NEXT: [[ADD_2:%.*]] = add i32 [[ADD_1]], [[Z_3]] -; LIMIT2-NEXT: [[Z_4:%.*]] = extractelement <8 x i32> [[Z]], i32 4 -; LIMIT2-NEXT: [[ADD_3:%.*]] = add i32 [[ADD_2]], [[Z_4]] +; LIMIT2-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i32> [[Z1:%.*]], <8 x i32> poison, <8 x i32> +; LIMIT2-NEXT: [[TMP1:%.*]] = add <8 x i32> [[Z1]], [[SHIFT]] +; LIMIT2-NEXT: [[SHIFT1:%.*]] = shufflevector <8 x i32> [[Z1]], <8 x i32> poison, <8 x i32> +; LIMIT2-NEXT: [[TMP2:%.*]] = add <8 x i32> [[TMP1]], [[SHIFT1]] +; LIMIT2-NEXT: [[SHIFT2:%.*]] = shufflevector <8 x i32> [[Z1]], <8 x i32> poison, <8 x i32> +; LIMIT2-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[SHIFT2]] +; LIMIT2-NEXT: [[SHIFT3:%.*]] = shufflevector <8 x i32> [[Z1]], <8 x i32> poison, <8 x i32> +; LIMIT2-NEXT: [[Z:%.*]] = add <8 x i32> [[TMP3]], [[SHIFT3]] +; LIMIT2-NEXT: [[Z_0:%.*]] = extractelement <8 x i32> [[Z]], i32 0 ; LIMIT2-NEXT: [[R:%.*]] = extractelement <4 x i32> [[LV]], i32 2 ; LIMIT2-NEXT: store i8 0, ptr [[Y:%.*]], align 1 -; LIMIT2-NEXT: [[ADD_4:%.*]] = add i32 [[ADD_3]], [[R]] +; LIMIT2-NEXT: [[ADD_4:%.*]] = add i32 [[Z_0]], [[R]] ; LIMIT2-NEXT: ret i32 [[ADD_4]] ; %lv = load <4 x i32>, ptr %x @@ -671,9 +671,9 @@ define i1 @load_with_non_power_of_2_element_type_2(ptr %x) { define i32 @load_multiple_extracts_with_constant_idx(ptr %x) { ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx( ; CHECK-NEXT: [[LV:%.*]] = load <4 x i32>, ptr [[X:%.*]], align 16 -; CHECK-NEXT: [[E_0:%.*]] = extractelement <4 x i32> [[LV]], i32 0 -; CHECK-NEXT: [[E_1:%.*]] = extractelement <4 x i32> [[LV]], i32 1 -; CHECK-NEXT: [[RES:%.*]] = add i32 [[E_0]], [[E_1]] +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[LV]], <4 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[LV]], [[SHIFT]] +; CHECK-NEXT: [[RES:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 ; CHECK-NEXT: ret i32 [[RES]] ; %lv = load <4 x i32>, ptr %x @@ -688,9 +688,9 @@ define i32 @load_multiple_extracts_with_constant_idx(ptr %x) { define i32 @load_multiple_extracts_with_constant_idx_profitable(ptr %x) { ; CHECK-LABEL: @load_multiple_extracts_with_constant_idx_profitable( ; CHECK-NEXT: [[LV:%.*]] = load <8 x i32>, ptr [[X:%.*]], align 16 -; CHECK-NEXT: [[E_0:%.*]] = extractelement <8 x i32> [[LV]], i32 0 -; CHECK-NEXT: [[E_1:%.*]] = extractelement <8 x i32> [[LV]], i32 6 -; CHECK-NEXT: [[RES:%.*]] = add i32 [[E_0]], [[E_1]] +; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i32> [[LV]], <8 x i32> poison, <8 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = add <8 x i32> [[LV]], [[SHIFT]] +; CHECK-NEXT: [[RES:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 ; CHECK-NEXT: ret i32 [[RES]] ; %lv = load <8 x i32>, ptr %x, align 16 diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll index bbf0db677461e..3cf2940e86808 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll @@ -404,11 +404,17 @@ define float @ext0_ext8_fmul_v16f32(<16 x float> %x) { } define float @ext14_ext15_fmul_v16f32(<16 x float> %x) { -; CHECK-LABEL: @ext14_ext15_fmul_v16f32( -; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x float> [[X:%.*]], i32 14 -; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x float> [[X]], i32 15 -; CHECK-NEXT: [[R:%.*]] = fadd float [[E0]], [[E1]] -; CHECK-NEXT: ret float [[R]] +; SSE-LABEL: @ext14_ext15_fmul_v16f32( +; SSE-NEXT: [[E0:%.*]] = extractelement <16 x float> [[X:%.*]], i32 14 +; SSE-NEXT: [[E1:%.*]] = extractelement <16 x float> [[X]], i32 15 +; SSE-NEXT: [[R:%.*]] = fadd float [[E0]], [[E1]] +; SSE-NEXT: ret float [[R]] +; +; AVX-LABEL: @ext14_ext15_fmul_v16f32( +; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x float> [[X:%.*]], <16 x float> poison, <16 x i32> +; AVX-NEXT: [[TMP1:%.*]] = fadd <16 x float> [[X]], [[SHIFT]] +; AVX-NEXT: [[R:%.*]] = extractelement <16 x float> [[TMP1]], i32 14 +; AVX-NEXT: ret float [[R]] ; %e0 = extractelement <16 x float> %x, i32 14 %e1 = extractelement <16 x float> %x, i32 15 diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll index 284d2859304eb..a08506840572c 100644 --- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll +++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll @@ -404,11 +404,17 @@ define float @ext0_ext8_fmul_v16f32(<16 x float> %x) { } define float @ext14_ext15_fmul_v16f32(<16 x float> %x) { -; CHECK-LABEL: @ext14_ext15_fmul_v16f32( -; CHECK-NEXT: [[E0:%.*]] = extractelement <16 x float> [[X:%.*]], i32 14 -; CHECK-NEXT: [[E1:%.*]] = extractelement <16 x float> [[X]], i32 15 -; CHECK-NEXT: [[R:%.*]] = fadd float [[E0]], [[E1]] -; CHECK-NEXT: ret float [[R]] +; SSE-LABEL: @ext14_ext15_fmul_v16f32( +; SSE-NEXT: [[E0:%.*]] = extractelement <16 x float> [[X:%.*]], i32 14 +; SSE-NEXT: [[E1:%.*]] = extractelement <16 x float> [[X]], i32 15 +; SSE-NEXT: [[R:%.*]] = fadd float [[E0]], [[E1]] +; SSE-NEXT: ret float [[R]] +; +; AVX-LABEL: @ext14_ext15_fmul_v16f32( +; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <16 x float> [[X:%.*]], <16 x float> poison, <16 x i32> +; AVX-NEXT: [[TMP1:%.*]] = fadd <16 x float> [[X]], [[SHIFT]] +; AVX-NEXT: [[R:%.*]] = extractelement <16 x float> [[TMP1]], i32 14 +; AVX-NEXT: ret float [[R]] ; %e0 = extractelement <16 x float> %x, i32 14 %e1 = extractelement <16 x float> %x, i32 15