Skip to content
41 changes: 22 additions & 19 deletions llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1401,26 +1401,25 @@ static Instruction *factorizeMinMaxTree(IntrinsicInst *II) {
/// try to shuffle after the intrinsic.
Instruction *
InstCombinerImpl::foldShuffledIntrinsicOperands(IntrinsicInst *II) {
// TODO: This should be extended to handle other intrinsics like fshl, ctpop,
// etc. Use llvm::isTriviallyVectorizable() and related to determine
// which intrinsics are safe to shuffle?
switch (II->getIntrinsicID()) {
case Intrinsic::smax:
case Intrinsic::smin:
case Intrinsic::umax:
case Intrinsic::umin:
case Intrinsic::fma:
case Intrinsic::fshl:
case Intrinsic::fshr:
break;
default:
if (!isTriviallyVectorizable(II->getIntrinsicID()))
return nullptr;

assert(isSafeToSpeculativelyExecute(II) &&
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If the call base has noundef on the return value, it is not safe to be speculatively executed.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is safe in the sense of isSafeToSpeculativelyExecute though, right? We create a new call without the attribute.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I mean this assertion is not needed.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this be turned into just a check then? I could imagine at some point we might want to mark e.g. udiv.fix as trivially vectorizable even if it's not speculatable. In which case we would not be able to perform this combine.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we just check Callee->isSpeculatable()?

Copy link
Contributor Author

@lukel97 lukel97 May 30, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about noundef etc? isSafeToSpeculativelyExecute also checks hasUBImplyingAttrs

EDIT: I see by default IgnoreUBImplyingAttrs is actually true

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As we finally create a new intrinsic call instead of reusing/in-place modifying the original one, noundef attributes are ignorable.

"Trivially vectorizable but not safe to speculatively execute?");

// fabs is canonicalized to fabs (shuffle ...) in foldShuffleOfUnaryOps, so
// avoid undoing it.
if (match(II, m_FAbs(m_Value())))
return nullptr;
}

Value *X;
Constant *C;
ArrayRef<int> Mask;
auto *NonConstArg = find_if_not(II->args(), IsaPred<Constant>);
auto *NonConstArg = find_if_not(II->args(), [&II](Use &Arg) {
return isa<Constant>(Arg.get()) ||
isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
Arg.getOperandNo(), nullptr);
});
if (!NonConstArg ||
!match(NonConstArg, m_Shuffle(m_Value(X), m_Poison(), m_Mask(Mask))))
return nullptr;
Expand All @@ -1432,11 +1431,15 @@ InstCombinerImpl::foldShuffledIntrinsicOperands(IntrinsicInst *II) {
// See if all arguments are shuffled with the same mask.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pre-existing, but the one use check above should be about the shuffle operands only, right? It doesn't help us if a constant argument is one-use (which it always is).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point, I went poking around though and it looks like constants don't return true for hasOneUse(). They seem to show up as 0 uses, so it happens to work today by coincidence.

Will fix anyway since I think with this patch we also need to worry about non-constant scalar args.

SmallVector<Value *, 4> NewArgs;
Type *SrcTy = X->getType();
for (Value *Arg : II->args()) {
if (match(Arg, m_Shuffle(m_Value(X), m_Poison(), m_SpecificMask(Mask))) &&
X->getType() == SrcTy)
for (Use &Arg : II->args()) {
if (isVectorIntrinsicWithScalarOpAtArg(II->getIntrinsicID(),
Arg.getOperandNo(), nullptr))
NewArgs.push_back(Arg);
else if (match(&Arg,
m_Shuffle(m_Value(X), m_Poison(), m_SpecificMask(Mask))) &&
X->getType() == SrcTy)
NewArgs.push_back(X);
else if (match(Arg, m_ImmConstant(C))) {
else if (match(&Arg, m_ImmConstant(C))) {
// If it's a constant, try find the constant that would be shuffled to C.
if (Constant *ShuffledC =
unshuffleConstant(Mask, C, cast<VectorType>(SrcTy)))
Expand Down
11 changes: 11 additions & 0 deletions llvm/test/Transforms/InstCombine/abs-1.ll
Original file line number Diff line number Diff line change
Expand Up @@ -978,3 +978,14 @@ define i32 @abs_diff_signed_slt_no_nsw_swap(i32 %a, i32 %b) {
%cond = select i1 %cmp, i32 %sub_ba, i32 %sub_ab
ret i32 %cond
}

define <2 x i32> @abs_unary_shuffle_ops(<2 x i32> %x) {
; CHECK-LABEL: @abs_unary_shuffle_ops(
; CHECK-NEXT: [[R2:%.*]] = call <2 x i32> @llvm.abs.v2i32(<2 x i32> [[R1:%.*]], i1 false)
; CHECK-NEXT: [[R:%.*]] = shufflevector <2 x i32> [[R2]], <2 x i32> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: ret <2 x i32> [[R]]
;
%a = shufflevector <2 x i32> %x, <2 x i32> poison, <2 x i32> <i32 1, i32 0>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For example, what happens if you add an extra use to this shufflevector? I think it will fold thanks to the i1 false argument.

%r = call <2 x i32> @llvm.abs(<2 x i32> %a, i1 false)
ret <2 x i32> %r
}
13 changes: 13 additions & 0 deletions llvm/test/Transforms/InstCombine/fma.ll
Original file line number Diff line number Diff line change
Expand Up @@ -972,6 +972,19 @@ define <2 x half> @fma_negone_vec_partial_undef(<2 x half> %x, <2 x half> %y) {
ret <2 x half> %sub
}

define <2 x float> @fmuladd_unary_shuffle_ops(<2 x float> %x, <2 x float> %y, <2 x float> %z) {
; CHECK-LABEL: @fmuladd_unary_shuffle_ops(
; CHECK-NEXT: [[R:%.*]] = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[A:%.*]], <2 x float> [[B:%.*]], <2 x float> [[C:%.*]])
; CHECK-NEXT: [[R1:%.*]] = shufflevector <2 x float> [[R]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: ret <2 x float> [[R1]]
;
%a = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 0>
%b = shufflevector <2 x float> %y, <2 x float> poison, <2 x i32> <i32 1, i32 0>
%c = shufflevector <2 x float> %z, <2 x float> poison, <2 x i32> <i32 1, i32 0>
%r = call <2 x float> @llvm.fmuladd(<2 x float> %a, <2 x float> %b, <2 x float> %c)
ret <2 x float> %r
}

; negative tests

define half @fma_non_negone(half %x, half %y) {
Expand Down
11 changes: 11 additions & 0 deletions llvm/test/Transforms/InstCombine/sqrt.ll
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,17 @@ define <2 x float> @sqrt_exp_vec(<2 x float> %x) {
ret <2 x float> %res
}

define <2 x float> @sqrt_unary_shuffle_ops(<2 x float> %x) {
; CHECK-LABEL: @sqrt_unary_shuffle_ops(
; CHECK-NEXT: [[R:%.*]] = call <2 x float> @llvm.sqrt.v2f32(<2 x float> [[A:%.*]])
; CHECK-NEXT: [[R1:%.*]] = shufflevector <2 x float> [[R]], <2 x float> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT: ret <2 x float> [[R1]]
;
%a = shufflevector <2 x float> %x, <2 x float> poison, <2 x i32> <i32 1, i32 0>
%r = call <2 x float> @llvm.sqrt(<2 x float> %a)
ret <2 x float> %r
}

declare i32 @foo(double)
declare double @sqrt(double) readnone
declare float @sqrtf(float)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -264,23 +264,17 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
; GFX8-NEXT: bb:
; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX8-NEXT: [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This fold changes the vector length of intrinsics.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah it's a bit strange, but I guess that was the existing behaviour. Here's a test case in test/InstCombine/fma.ll:

define <2 x float> @fma_unary_shuffle_ops_narrowing(<3 x float> %x, <3 x float> %y, <3 x float> %z) {
; CHECK-LABEL: @fma_unary_shuffle_ops_narrowing(
; CHECK-NEXT:    [[B:%.*]] = shufflevector <3 x float> [[Y:%.*]], <3 x float> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT:    call void @use_vec(<2 x float> [[B]])
; CHECK-NEXT:    [[TMP1:%.*]] = call nnan nsz <3 x float> @llvm.fma.v3f32(<3 x float> [[X:%.*]], <3 x float> [[Y]], <3 x float> [[Z:%.*]])
; CHECK-NEXT:    [[R:%.*]] = shufflevector <3 x float> [[TMP1]], <3 x float> poison, <2 x i32> <i32 1, i32 0>
; CHECK-NEXT:    ret <2 x float> [[R]]
;
  %a = shufflevector <3 x float> %x, <3 x float> poison, <2 x i32> <i32 1, i32 0>
  %b = shufflevector <3 x float> %y, <3 x float> poison, <2 x i32> <i32 1, i32 0>
  call void @use_vec(<2 x float> %b)
  %c = shufflevector <3 x float> %z, <3 x float> poison, <2 x i32> <i32 1, i32 0>
  %r = call nnan nsz <2 x float> @llvm.fma.v2f32(<2 x float> %a, <2 x float> %b, <2 x float> %c)
  ret <2 x float> %r
}

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd like to avoid this fold when shufflevectors change the type (as a follow-up). cc @RKSimon

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should still allow the case where we combine into a smaller type, e.g.

define <3 x float> @fma_unary_shuffle_ops_widening(<2 x float> %x, <2 x float> %y, <2 x float> %z) {
; CHECK-LABEL: @fma_unary_shuffle_ops_widening(
; CHECK-NEXT:    [[A:%.*]] = shufflevector <2 x float> [[X:%.*]], <2 x float> poison, <3 x i32> <i32 1, i32 0, i32 1>
; CHECK-NEXT:    call void @use_vec3(<3 x float> [[A]])
; CHECK-NEXT:    [[TMP1:%.*]] = call fast <2 x float> @llvm.fma.v2f32(<2 x float> [[X]], <2 x float> [[Y:%.*]], <2 x float> [[Z:%.*]])
; CHECK-NEXT:    [[R:%.*]] = shufflevector <2 x float> [[TMP1]], <2 x float> poison, <3 x i32> <i32 1, i32 0, i32 1>
; CHECK-NEXT:    ret <3 x float> [[R]]
;
  %a = shufflevector <2 x float> %x, <2 x float> poison, <3 x i32> <i32 1, i32 0, i32 1>
  call void @use_vec3(<3 x float> %a)
  %b = shufflevector <2 x float> %y, <2 x float> poison, <3 x i32> <i32 1, i32 0, i32 1>
  %c = shufflevector <2 x float> %z, <2 x float> poison, <3 x i32> <i32 1, i32 0, i32 1>
  %r = call fast <3 x float> @llvm.fma.v3f32(<3 x float> %a, <3 x float> %b, <3 x float> %c)
  ret <3 x float> %r
}

I agree either way, this would make it more consistent with the binop combine in InstCombinerImpl::foldVectorBinop which currently only allows the same type for two non-constant operands. For a constant operand, it looks like it's allowed to narrow it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, we should not be doing length changes without cost modeling...

; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
; GFX8-NEXT: ret <3 x i16> [[INS_2]]
;
; GFX9-LABEL: @uadd_sat_v3i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX9-NEXT: [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
; GFX9-NEXT: ret <3 x i16> [[INS_2]]
;
Expand Down Expand Up @@ -323,24 +317,20 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
;
; GFX8-LABEL: @uadd_sat_v4i16(
; GFX8-NEXT: bb:
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
; GFX8-NEXT: [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; GFX8-NEXT: ret <4 x i16> [[INS_31]]
;
; GFX9-LABEL: @uadd_sat_v4i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX9-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
; GFX9-NEXT: [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; GFX9-NEXT: ret <4 x i16> [[INS_31]]
;
bb:
Expand Down
30 changes: 10 additions & 20 deletions llvm/test/Transforms/SLPVectorizer/AMDGPU/add_sub_sat.ll
Original file line number Diff line number Diff line change
Expand Up @@ -264,23 +264,17 @@ define <3 x i16> @uadd_sat_v3i16(<3 x i16> %arg0, <3 x i16> %arg1) {
; GFX8-NEXT: bb:
; GFX8-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
; GFX8-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX8-NEXT: [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
; GFX8-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
; GFX8-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
; GFX8-NEXT: ret <3 x i16> [[INS_2]]
;
; GFX9-LABEL: @uadd_sat_v3i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[ARG0_2:%.*]] = extractelement <3 x i16> [[ARG0:%.*]], i64 2
; GFX9-NEXT: [[ARG1_2:%.*]] = extractelement <3 x i16> [[ARG1:%.*]], i64 2
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <3 x i16> [[ARG0]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <3 x i16> [[ARG1]], <3 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX9-NEXT: [[TMP3:%.*]] = call <3 x i16> @llvm.uadd.sat.v3i16(<3 x i16> [[ARG0]], <3 x i16> [[ARG1]])
; GFX9-NEXT: [[ADD_2:%.*]] = call i16 @llvm.uadd.sat.i16(i16 [[ARG0_2]], i16 [[ARG1_2]])
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> poison, <3 x i32> <i32 0, i32 1, i32 poison>
; GFX9-NEXT: [[INS_2:%.*]] = insertelement <3 x i16> [[TMP3]], i16 [[ADD_2]], i64 2
; GFX9-NEXT: ret <3 x i16> [[INS_2]]
;
Expand Down Expand Up @@ -323,24 +317,20 @@ define <4 x i16> @uadd_sat_v4i16(<4 x i16> %arg0, <4 x i16> %arg1) {
;
; GFX8-LABEL: @uadd_sat_v4i16(
; GFX8-NEXT: bb:
; GFX8-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX8-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX8-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
; GFX8-NEXT: [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
; GFX8-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX8-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX8-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; GFX8-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; GFX8-NEXT: ret <4 x i16> [[INS_31]]
;
; GFX9-LABEL: @uadd_sat_v4i16(
; GFX9-NEXT: bb:
; GFX9-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> [[ARG0:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP1:%.*]] = shufflevector <4 x i16> [[ARG1:%.*]], <4 x i16> poison, <2 x i32> <i32 0, i32 1>
; GFX9-NEXT: [[TMP2:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]])
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[ARG0]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX9-NEXT: [[TMP0:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0:%.*]], <4 x i16> [[ARG2:%.*]])
; GFX9-NEXT: [[ARG1:%.*]] = call <4 x i16> @llvm.uadd.sat.v4i16(<4 x i16> [[ARG0]], <4 x i16> [[ARG2]])
; GFX9-NEXT: [[TMP4:%.*]] = shufflevector <4 x i16> [[ARG1]], <4 x i16> poison, <2 x i32> <i32 2, i32 3>
; GFX9-NEXT: [[TMP5:%.*]] = call <2 x i16> @llvm.uadd.sat.v2i16(<2 x i16> [[TMP3]], <2 x i16> [[TMP4]])
; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <2 x i16> [[TMP2]], <2 x i16> [[TMP5]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; GFX9-NEXT: [[TMP3:%.*]] = shufflevector <2 x i16> [[TMP4]], <2 x i16> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
; GFX9-NEXT: [[INS_31:%.*]] = shufflevector <4 x i16> [[TMP0]], <4 x i16> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 4, i32 5>
; GFX9-NEXT: ret <4 x i16> [[INS_31]]
;
bb:
Expand Down
Loading
Loading