67 changes: 37 additions & 30 deletions llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -696,79 +696,86 @@ bool VectorCombine::foldExtractExtract(Instruction &I) {
/// shuffle.
bool VectorCombine::foldInsExtFNeg(Instruction &I) {
// Match an insert (op (extract)) pattern.
Value *DestVec;
uint64_t Index;
Value *DstVec;
uint64_t ExtIdx, InsIdx;
Instruction *FNeg;
if (!match(&I, m_InsertElt(m_Value(DestVec), m_OneUse(m_Instruction(FNeg)),
m_ConstantInt(Index))))
if (!match(&I, m_InsertElt(m_Value(DstVec), m_OneUse(m_Instruction(FNeg)),
m_ConstantInt(InsIdx))))
return false;

// Note: This handles the canonical fneg instruction and "fsub -0.0, X".
Value *SrcVec;
Instruction *Extract;
if (!match(FNeg, m_FNeg(m_CombineAnd(
m_Instruction(Extract),
m_ExtractElt(m_Value(SrcVec), m_SpecificInt(Index))))))
m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx))))))
return false;

auto *VecTy = cast<FixedVectorType>(I.getType());
auto *ScalarTy = VecTy->getScalarType();
auto *DstVecTy = cast<FixedVectorType>(DstVec->getType());
auto *DstVecScalarTy = DstVecTy->getScalarType();
auto *SrcVecTy = dyn_cast<FixedVectorType>(SrcVec->getType());
if (!SrcVecTy || ScalarTy != SrcVecTy->getScalarType())
if (!SrcVecTy || DstVecScalarTy != SrcVecTy->getScalarType())
return false;

// Ignore bogus insert/extract index.
unsigned NumElts = VecTy->getNumElements();
if (Index >= NumElts)
// Ignore if the insert/extract index is out of bounds or the destination
// vector has only one element.
unsigned NumDstElts = DstVecTy->getNumElements();
unsigned NumSrcElts = SrcVecTy->getNumElements();
if (ExtIdx > NumSrcElts || InsIdx >= NumDstElts || NumDstElts == 1)
Collaborator: ExtIdx >= NumSrcElts ?

ParkHanbum (Contributor Author), Nov 5, 2025: When the vector has only one element, the only extractable element is index 0, so an index equal to the element count would already exceed the bounds of SrcVec.
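For illustration, here is a hypothetical case (not one of the tests in this patch) that the new NumDstElts == 1 guard intentionally skips: the destination has a single lane, so the select-shuffle would overwrite the whole vector and the fold offers nothing over the scalar sequence.

; Skipped by the guard: the destination <1 x float> has only lane 0,
; so the shuffle form would fully replace %y.
define <1 x float> @skip_single_elt_dst(<4 x float> %x, <1 x float> %y) {
  %e = extractelement <4 x float> %x, i32 3
  %n = fneg float %e
  %r = insertelement <1 x float> %y, float %n, i32 0
  ret <1 x float> %r
}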

return false;

// We are inserting the negated element into the same lane that we extracted
// from. This is equivalent to a select-shuffle that chooses all but the
// negated element from the destination vector.
Collaborator: Update the comment.

SmallVector<int> Mask(NumElts);
SmallVector<int> Mask(NumDstElts);
std::iota(Mask.begin(), Mask.end(), 0);
Mask[Index] = Index + NumElts;
Mask[InsIdx] = (ExtIdx % NumDstElts) + NumDstElts;
InstructionCost OldCost =
TTI.getArithmeticInstrCost(Instruction::FNeg, ScalarTy, CostKind) +
TTI.getVectorInstrCost(I, VecTy, CostKind, Index);
TTI.getArithmeticInstrCost(Instruction::FNeg, DstVecScalarTy, CostKind) +
TTI.getVectorInstrCost(I, DstVecTy, CostKind, InsIdx);

// If the extract has one use, it will be eliminated, so count it in the
// original cost. If it has more than one use, ignore the cost because it will
// be the same before/after.
if (Extract->hasOneUse())
OldCost += TTI.getVectorInstrCost(*Extract, VecTy, CostKind, Index);
OldCost += TTI.getVectorInstrCost(*Extract, SrcVecTy, CostKind, ExtIdx);

InstructionCost NewCost =
TTI.getArithmeticInstrCost(Instruction::FNeg, VecTy, CostKind) +
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, VecTy,
Mask, CostKind);
TTI.getArithmeticInstrCost(Instruction::FNeg, SrcVecTy, CostKind) +
TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, DstVecTy,
DstVecTy, Mask, CostKind);

bool NeedLenChg = SrcVecTy->getNumElements() != NumElts;
bool NeedLenChg = SrcVecTy->getNumElements() != NumDstElts;
// If the lengths of the two vectors are not equal,
// we need to add a length-change vector. Add this cost.
SmallVector<int> SrcMask;
if (NeedLenChg) {
SrcMask.assign(NumElts, PoisonMaskElem);
SrcMask[Index] = Index;
SrcMask.assign(NumDstElts, PoisonMaskElem);
SrcMask[ExtIdx % NumDstElts] = ExtIdx;
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc,
VecTy, SrcVecTy, SrcMask, CostKind);
DstVecTy, SrcVecTy, SrcMask, CostKind);
}

LLVM_DEBUG(dbgs() << "Found an insertion of (extract)fneg : " << I
<< "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
<< "\n");
if (NewCost > OldCost)
return false;

Value *NewShuf;
// insertelt DestVec, (fneg (extractelt SrcVec, Index)), Index
Value *NewShuf, *LenChgShuf = nullptr;
// insertelt DstVec, (fneg (extractelt SrcVec, Index)), Index
Value *VecFNeg = Builder.CreateFNegFMF(SrcVec, FNeg);
if (NeedLenChg) {
// shuffle DestVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
Value *LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
NewShuf = Builder.CreateShuffleVector(DestVec, LenChgShuf, Mask);
// shuffle DstVec, (shuffle (fneg SrcVec), poison, SrcMask), Mask
LenChgShuf = Builder.CreateShuffleVector(VecFNeg, SrcMask);
NewShuf = Builder.CreateShuffleVector(DstVec, LenChgShuf, Mask);
Worklist.pushValue(LenChgShuf);
} else {
// shuffle DestVec, (fneg SrcVec), Mask
NewShuf = Builder.CreateShuffleVector(DestVec, VecFNeg, Mask);
// shuffle DstVec, (fneg SrcVec), Mask
NewShuf = Builder.CreateShuffleVector(DstVec, VecFNeg, Mask);
}

Worklist.pushValue(VecFNeg);
replaceValue(I, *NewShuf);
return true;
}
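To make the mask arithmetic above concrete, here is a hedged walk-through of the length-changing path, using the same shapes as the ext2_v4f32v2f32 test added below: SrcVec is <4 x float>, DstVec is <2 x float>, ExtIdx = 3, InsIdx = 1, so NumDstElts = 2, Mask = [0, (3 % 2) + 2] = [0, 3], and SrcMask = [poison, 3].

; Sketch of the rewrite for the mismatched-length case (mirrors ext2_v4f32v2f32).
; Before:
;   %e = extractelement <4 x float> %x, i32 3
;   %n = fneg float %e
;   %r = insertelement <2 x float> %y, float %n, i32 1
; After: fneg the whole source, shrink it with SrcMask, then select-shuffle with Mask.
define <2 x float> @fold_sketch(<4 x float> %x, <2 x float> %y) {
  %vecfneg = fneg <4 x float> %x
  %lenchg = shufflevector <4 x float> %vecfneg, <4 x float> poison, <2 x i32> <i32 poison, i32 3>
  %r = shufflevector <2 x float> %y, <2 x float> %lenchg, <2 x i32> <i32 0, i32 3>
  ret <2 x float> %r
}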
6 changes: 2 additions & 4 deletions llvm/test/Transforms/PhaseOrdering/X86/addsub-inseltpoison.ll
@@ -498,11 +498,9 @@ define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %
; PR58139
define <2 x double> @_mm_complexmult_pd_naive(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: @_mm_complexmult_pd_naive(
; SSE-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B:%.*]], i64 1
; SSE-NEXT: [[TMP1:%.*]] = fneg double [[B1]]
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TMP1]], i64 0
; SSE-NEXT: [[TMP3:%.*]] = fneg <2 x double> [[B:%.*]]
; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[B]], <2 x i32> <i32 1, i32 2>
; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> zeroinitializer
; SSE-NEXT: [[TMP7:%.*]] = tail call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP6]], <2 x double> [[B]], <2 x double> [[TMP5]])
6 changes: 2 additions & 4 deletions llvm/test/Transforms/PhaseOrdering/X86/addsub.ll
@@ -502,11 +502,9 @@ define void @add_aggregate_store(<2 x float> %a0, <2 x float> %a1, <2 x float> %
; PR58139
define <2 x double> @_mm_complexmult_pd_naive(<2 x double> %a, <2 x double> %b) {
; SSE-LABEL: @_mm_complexmult_pd_naive(
; SSE-NEXT: [[B1:%.*]] = extractelement <2 x double> [[B:%.*]], i64 1
; SSE-NEXT: [[TMP1:%.*]] = fneg double [[B1]]
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 1>
; SSE-NEXT: [[TMP3:%.*]] = shufflevector <2 x double> [[B]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
; SSE-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[TMP1]], i64 0
; SSE-NEXT: [[TMP3:%.*]] = fneg <2 x double> [[B:%.*]]
; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> [[B]], <2 x i32> <i32 1, i32 2>
; SSE-NEXT: [[TMP5:%.*]] = fmul <2 x double> [[TMP2]], [[TMP4]]
; SSE-NEXT: [[TMP6:%.*]] = shufflevector <2 x double> [[A]], <2 x double> poison, <2 x i32> zeroinitializer
; SSE-NEXT: [[TMP7:%.*]] = tail call <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[TMP6]], <2 x double> [[B]], <2 x double> [[TMP5]])
150 changes: 115 additions & 35 deletions llvm/test/Transforms/VectorCombine/X86/extract-fneg-insert.ll
@@ -58,6 +58,19 @@ define <4 x float> @ext2_v2f32v4f32(<2 x float> %x, <4 x float> %y) {
ret <4 x float> %r
}

define <2 x float> @ext2_v4f32v2f32(<4 x float> %x, <2 x float> %y) {
; CHECK-LABEL: @ext2_v4f32v2f32(
; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[X:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 poison, i32 3>
; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x float> [[Y:%.*]], <2 x float> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: ret <2 x float> [[R]]
;
%e = extractelement <4 x float> %x, i32 3
%n = fneg float %e
%r = insertelement <2 x float> %y, float %n, i32 1
ret <2 x float> %r
}

; Eliminating extract/insert is still profitable. Flags propagate.

define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
@@ -73,24 +86,31 @@ define <2 x double> @ext1_v2f64(<2 x double> %x, <2 x double> %y) {
}

Collaborator: Do we have a test case where the dst vector is larger than the src vector?

ParkHanbum (Contributor Author): I'll update it ASAP.

define <4 x double> @ext1_v2f64v4f64(<2 x double> %x, <4 x double> %y) {
; SSE-LABEL: @ext1_v2f64v4f64(
; SSE-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
; SSE-NEXT: [[N:%.*]] = fneg nsz double [[E]]
; SSE-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 1
; SSE-NEXT: ret <4 x double> [[R]]
;
; AVX-LABEL: @ext1_v2f64v4f64(
; AVX-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
; AVX-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
; AVX-NEXT: ret <4 x double> [[R]]
; CHECK-LABEL: @ext1_v2f64v4f64(
; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 0, i32 5, i32 2, i32 3>
; CHECK-NEXT: ret <4 x double> [[R]]
;
%e = extractelement <2 x double> %x, i32 1
%n = fneg nsz double %e
%r = insertelement <4 x double> %y, double %n, i32 1
ret <4 x double> %r
}

define <2 x double> @ext1_v4f64v2f64(<4 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @ext1_v4f64v2f64(
; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <4 x double> [[X:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: ret <2 x double> [[R]]
;
%e = extractelement <4 x double> %x, i32 3
%n = fneg nsz double %e
%r = insertelement <2 x double> %y, double %n, i32 1
ret <2 x double> %r
}

define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {
; CHECK-LABEL: @ext7_v8f32(
; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
@@ -105,9 +125,9 @@ define <8 x float> @ext7_v8f32(<8 x float> %x, <8 x float> %y) {

define <8 x float> @ext7_v4f32v8f32(<4 x float> %x, <8 x float> %y) {
; CHECK-LABEL: @ext7_v4f32v8f32(
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 7
; CHECK-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[X:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 11>
; CHECK-NEXT: ret <8 x float> [[R]]
;
%e = extractelement <4 x float> %x, i32 3
@@ -116,6 +136,19 @@ define <8 x float> @ext7_v4f32v8f32(<4 x float> %x, <8 x float> %y) {
ret <8 x float> %r
}

define <4 x float> @ext7_v8f32v4f32(<8 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext7_v8f32v4f32(
; CHECK-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 7>
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
; CHECK-NEXT: ret <4 x float> [[R]]
;
%e = extractelement <8 x float> %x, i32 7
%n = fneg float %e
%r = insertelement <4 x float> %y, float %n, i32 3
ret <4 x float> %r
}

; Same as above with an extra use of the extracted element.

define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
@@ -141,12 +174,20 @@ define <8 x float> @ext7_v8f32_use1(<8 x float> %x, <8 x float> %y) {
}

define <8 x float> @ext7_v4f32v8f32_use1(<4 x float> %x, <8 x float> %y) {
; CHECK-LABEL: @ext7_v4f32v8f32_use1(
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
; CHECK-NEXT: call void @use(float [[E]])
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
; CHECK-NEXT: ret <8 x float> [[R]]
; SSE-LABEL: @ext7_v4f32v8f32_use1(
; SSE-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
; SSE-NEXT: call void @use(float [[E]])
; SSE-NEXT: [[TMP1:%.*]] = fneg <4 x float> [[X]]
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE-NEXT: [[R:%.*]] = shufflevector <8 x float> [[Y:%.*]], <8 x float> [[TMP2]], <8 x i32> <i32 0, i32 1, i32 2, i32 11, i32 4, i32 5, i32 6, i32 7>
; SSE-NEXT: ret <8 x float> [[R]]
;
; AVX-LABEL: @ext7_v4f32v8f32_use1(
; AVX-NEXT: [[E:%.*]] = extractelement <4 x float> [[X:%.*]], i32 3
; AVX-NEXT: call void @use(float [[E]])
; AVX-NEXT: [[N:%.*]] = fneg float [[E]]
; AVX-NEXT: [[R:%.*]] = insertelement <8 x float> [[Y:%.*]], float [[N]], i32 3
; AVX-NEXT: ret <8 x float> [[R]]
;
%e = extractelement <4 x float> %x, i32 3
call void @use(float %e)
@@ -155,6 +196,29 @@ define <8 x float> @ext7_v4f32v8f32_use1(<4 x float> %x, <8 x float> %y) {
ret <8 x float> %r
}

define <4 x float> @ext7_v8f32v4f32_use1(<8 x float> %x, <4 x float> %y) {
; SSE-LABEL: @ext7_v8f32v4f32_use1(
; SSE-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
; SSE-NEXT: call void @use(float [[E]])
; SSE-NEXT: [[TMP1:%.*]] = fneg <8 x float> [[X]]
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[TMP1]], <8 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 7>
; SSE-NEXT: [[R:%.*]] = shufflevector <4 x float> [[Y:%.*]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
; SSE-NEXT: ret <4 x float> [[R]]
;
; AVX-LABEL: @ext7_v8f32v4f32_use1(
; AVX-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
; AVX-NEXT: call void @use(float [[E]])
; AVX-NEXT: [[N:%.*]] = fneg float [[E]]
; AVX-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 3
; AVX-NEXT: ret <4 x float> [[R]]
;
%e = extractelement <8 x float> %x, i32 7
call void @use(float %e)
%n = fneg float %e
%r = insertelement <4 x float> %y, float %n, i32 3
ret <4 x float> %r
}

; Negative test - the transform is likely not profitable if the fneg has another use.

define <8 x float> @ext7_v8f32_use2(<8 x float> %x, <8 x float> %y) {
@@ -187,6 +251,21 @@ define <8 x float> @ext7_v4f32v8f32_use2(<4 x float> %x, <8 x float> %y) {
ret <8 x float> %r
}

define <4 x float> @ext7_v8f32v4f32_use2(<8 x float> %x, <4 x float> %y) {
; CHECK-LABEL: @ext7_v8f32v4f32_use2(
; CHECK-NEXT: [[E:%.*]] = extractelement <8 x float> [[X:%.*]], i32 7
; CHECK-NEXT: [[N:%.*]] = fneg float [[E]]
; CHECK-NEXT: call void @use(float [[N]])
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> [[Y:%.*]], float [[N]], i32 3
; CHECK-NEXT: ret <4 x float> [[R]]
;
%e = extractelement <8 x float> %x, i32 7
%n = fneg float %e
call void @use(float %n)
%r = insertelement <4 x float> %y, float %n, i32 3
ret <4 x float> %r
}

; Negative test - can't convert variable index to a shuffle.

define <2 x double> @ext_index_var_v2f64(<2 x double> %x, <2 x double> %y, i32 %index) {
@@ -215,14 +294,10 @@ define <4 x double> @ext_index_var_v2f64v4f64(<2 x double> %x, <4 x double> %y,
ret <4 x double> %r
}

; Negative test - require same extract/insert index for simple shuffle.
; TODO: We could handle this by adjusting the cost calculation.

define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @ext1_v2f64_ins0(
; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 0
; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP1]], <2 x i32> <i32 3, i32 1>
; CHECK-NEXT: ret <2 x double> [[R]]
;
%e = extractelement <2 x double> %x, i32 1
Expand All @@ -231,12 +306,11 @@ define <2 x double> @ext1_v2f64_ins0(<2 x double> %x, <2 x double> %y) {
ret <2 x double> %r
}

; Negative test - extract from an index greater than the vector width of the destination
define <2 x double> @ext3_v4f64v2f64(<4 x double> %x, <2 x double> %y) {
; CHECK-LABEL: @ext3_v4f64v2f64(
; CHECK-NEXT: [[E:%.*]] = extractelement <4 x double> [[X:%.*]], i32 3
; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
; CHECK-NEXT: [[R:%.*]] = insertelement <2 x double> [[Y:%.*]], double [[N]], i32 1
; CHECK-NEXT: [[TMP1:%.*]] = fneg nsz <4 x double> [[X:%.*]]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <2 x i32> <i32 poison, i32 3>
; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x double> [[Y:%.*]], <2 x double> [[TMP2]], <2 x i32> <i32 0, i32 3>
; CHECK-NEXT: ret <2 x double> [[R]]
;
%e = extractelement <4 x double> %x, i32 3
@@ -246,11 +320,17 @@ define <2 x double> @ext3_v4f64v2f64(<4 x double> %x, <2 x double> %y) {
}

define <4 x double> @ext1_v2f64v4f64_ins0(<2 x double> %x, <4 x double> %y) {
; CHECK-LABEL: @ext1_v2f64v4f64_ins0(
; CHECK-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
; CHECK-NEXT: [[N:%.*]] = fneg nsz double [[E]]
; CHECK-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0
; CHECK-NEXT: ret <4 x double> [[R]]
; SSE-LABEL: @ext1_v2f64v4f64_ins0(
; SSE-NEXT: [[TMP1:%.*]] = fneg nsz <2 x double> [[X:%.*]]
; SSE-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <4 x i32> <i32 poison, i32 1, i32 poison, i32 poison>
; SSE-NEXT: [[R:%.*]] = shufflevector <4 x double> [[Y:%.*]], <4 x double> [[TMP2]], <4 x i32> <i32 5, i32 1, i32 2, i32 3>
; SSE-NEXT: ret <4 x double> [[R]]
;
; AVX-LABEL: @ext1_v2f64v4f64_ins0(
; AVX-NEXT: [[E:%.*]] = extractelement <2 x double> [[X:%.*]], i32 1
; AVX-NEXT: [[N:%.*]] = fneg nsz double [[E]]
; AVX-NEXT: [[R:%.*]] = insertelement <4 x double> [[Y:%.*]], double [[N]], i32 0
; AVX-NEXT: ret <4 x double> [[R]]
;
%e = extractelement <2 x double> %x, i32 1
%n = fneg nsz double %e