Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 47 additions & 6 deletions llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23783,7 +23783,8 @@ class HorizontalReduction {

/// Attempt to vectorize the tree found by matchAssociativeReduction.
Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
const TargetLibraryInfo &TLI, AssumptionCache *AC) {
const TargetLibraryInfo &TLI, AssumptionCache *AC,
DominatorTree &DT) {
constexpr unsigned RegMaxNumber = 4;
constexpr unsigned RedValsMaxNumber = 128;
// If there are a sufficient number of reduction values, reduce
Expand Down Expand Up @@ -23900,8 +23901,44 @@ class HorizontalReduction {
bool CheckForReusedReductionOps = false;
// Try to vectorize elements based on their type.
SmallVector<InstructionsState> States;
for (ArrayRef<Value *> RV : ReducedVals)
SmallVector<SmallVector<Value *>> LocalReducedVals;
for (ArrayRef<Value *> RV : ReducedVals) {
// Loads are not very compatible with undefs.
if (isa<UndefValue>(RV.front()) &&
(States.empty() || !States.back() ||
States.back().getOpcode() == Instruction::Load)) {
LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
States.push_back(InstructionsState::invalid());
continue;
}
if (!LocalReducedVals.empty() &&
isa<UndefValue>(LocalReducedVals.back().front()) &&
isa<LoadInst>(RV.front())) {
LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
States.push_back(getSameOpcode(RV, TLI));
continue;
}
SmallVector<Value *> Ops;
if (!LocalReducedVals.empty())
Ops = LocalReducedVals.back();
Ops.append(RV.begin(), RV.end());
InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI);
InstructionsState OpS =
Analysis.buildInstructionsState(Ops, V, VectorizeCopyableElements);
if (LocalReducedVals.empty()) {
LocalReducedVals.push_back(Ops);
States.push_back(OpS);
continue;
}
if (OpS) {
LocalReducedVals.back().swap(Ops);
States.back() = OpS;
continue;
}
LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
States.push_back(getSameOpcode(RV, TLI));
}
ReducedVals.swap(LocalReducedVals);
for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
InstructionsState S = States[I];
Expand All @@ -23916,8 +23953,10 @@ class HorizontalReduction {
// Also check if the instruction was folded to constant/other value.
auto *Inst = dyn_cast<Instruction>(RdxVal);
if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
(!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
(S && !Inst))
(!S || (!S.getMatchingMainOpOrAltOp(Inst) &&
!S.isCopyableElement(Inst)))) ||
(S && !Inst && !isa<PoisonValue>(RdxVal) &&
!S.isCopyableElement(RdxVal)))
continue;
Candidates.push_back(RdxVal);
TrackedToOrig.try_emplace(RdxVal, ReducedVal);
Expand Down Expand Up @@ -24503,6 +24542,8 @@ class HorizontalReduction {
// Scalar cost is repeated for N-1 elements.
int Cnt = ReducedVals.size();
for (Value *RdxVal : ReducedVals) {
if (!isa<Instruction>(RdxVal))
continue;
if (Cnt == 1)
break;
--Cnt;
Expand Down Expand Up @@ -25247,7 +25288,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
HorizontalReduction HorRdx;
if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
return nullptr;
return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
};
auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
if (TryOperandsAsNewSeeds && FutureSeed == Root) {
Expand Down Expand Up @@ -25392,7 +25433,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
if (RedCost >= ScalarCost)
return false;

return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
};
if (Candidates.size() == 1)
return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
Expand Down
36 changes: 20 additions & 16 deletions llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
Original file line number Diff line number Diff line change
Expand Up @@ -5,34 +5,38 @@
define void @Test(i32) {
; CHECK-LABEL: @Test(
; CHECK-NEXT: entry:
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
; CHECK-NEXT: [[LOCAL_8_43_US1:%.*]] = phi i32 [ [[VAL_44:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[LOCAL_8_43_US]], i32 1
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOCAL_8_43_US1]], i32 2
; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP6]], <i32 0, i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529>
; CHECK-NEXT: [[VAL_41:%.*]] = add i32 [[LOCAL_8_43_US1]], 13685
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]]
; CHECK-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US]]
; CHECK-NEXT: [[VAL_43]] = add i32 [[LOCAL_8_43_US]], 14910
; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0
; CHECK-NEXT: [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_43]], i32 1
; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP4]], [[VAL_41]]
; CHECK-NEXT: [[VAL_43]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US1]]
; CHECK-NEXT: [[VAL_44]] = add i32 [[LOCAL_8_43_US1]], 14910
; CHECK-NEXT: br label [[LOOP]]
;
; FORCE_REDUCTION-LABEL: @Test(
; FORCE_REDUCTION-NEXT: entry:
; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0
; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]]
; FORCE_REDUCTION: loop:
; FORCE_REDUCTION-NEXT: [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ]
; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
; FORCE_REDUCTION-NEXT: [[LOCAL_8_43_US1:%.*]] = phi i32 [ [[VAL_44:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[LOCAL_8_43_US]], i32 1
; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOCAL_8_43_US1]], i32 2
; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP6]], <i32 0, i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529>
; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[LOCAL_8_43_US1]], 13685
; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]]
; FORCE_REDUCTION-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US]]
; FORCE_REDUCTION-NEXT: [[VAL_43]] = add i32 [[LOCAL_8_43_US]], 14910
; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0
; FORCE_REDUCTION-NEXT: [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_43]], i32 1
; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP4]], [[VAL_41]]
; FORCE_REDUCTION-NEXT: [[VAL_43]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US1]]
; FORCE_REDUCTION-NEXT: [[VAL_44]] = add i32 [[LOCAL_8_43_US1]], 14910
; FORCE_REDUCTION-NEXT: br label [[LOOP]]
;
entry:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@ define i8 @test() {
; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 0 to i8
; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 0 to i8
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 0 to i8
; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> zeroinitializer)
; CHECK-NEXT: [[OP_RDX1:%.*]] = or i8 [[TMP0]], [[TMP2]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = or i8 [[OP_RDX1]], [[TMP0]]
; CHECK-NEXT: [[OP_RDX3:%.*]] = or i8 [[OP_RDX2]], [[TMP1]]
; CHECK-NEXT: [[OP_RDX5:%.*]] = or i8 [[OP_RDX3]], [[TMP3]]
; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> zeroinitializer)
; CHECK-NEXT: [[OP_RDX4:%.*]] = or i8 [[OP_RDX5]], [[TMP4]]
; CHECK-NEXT: ret i8 [[OP_RDX4]]
;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,9 @@ define i32 @foo() {
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], zeroinitializer
; CHECK-NEXT: [[RDX_OP:%.*]] = mul <4 x i32> [[TMP4]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[RDX_OP]])
; CHECK-NEXT: [[OP_RDX7:%.*]] = mul i32 0, [[TMP2]]
; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 0, [[TMP5]]
; CHECK-NEXT: [[OP_RDX1:%.*]] = mul i32 [[OP_RDX]], 0
; CHECK-NEXT: [[OP_RDX6:%.*]] = mul i32 [[OP_RDX1]], [[TMP2]]
; CHECK-NEXT: [[OP_RDX6:%.*]] = mul i32 [[OP_RDX7]], [[OP_RDX]]
; CHECK-NEXT: ret i32 [[OP_RDX6]]
;
bb:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ define void @test() {
; CHECK-NEXT: br i1 false, label [[PH:%.*]], label [[EXIT:%.*]]
; CHECK: ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> zeroinitializer)
; CHECK-NEXT: [[OP_RDX:%.*]] = and i8 0, [[TMP0]]
; CHECK-NEXT: [[OP_RDX:%.*]] = and i8 [[TMP0]], 0
; CHECK-NEXT: [[OP_RDX1:%.*]] = and i8 [[OP_RDX]], 0
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -7,16 +7,17 @@ define <4 x i16> @test() {
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> zeroinitializer, <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i16> [[TMP0]], zeroinitializer
; CHECK-NEXT: [[RDX_OP:%.*]] = or <16 x i16> zeroinitializer, [[TMP1]]
; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-NEXT: [[RDX_OP1:%.*]] = or <16 x i16> [[RDX_OP]], zeroinitializer
; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-NEXT: [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP36]])
; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i16> poison, i16 [[TMP37]], i64 0
; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
; CHECK-NEXT: [[TMP40:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP39]])
; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[TMP40]], i64 1
; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; CHECK-NEXT: [[TMP43:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP42]])
; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP41]], i16 [[TMP43]], i64 2
; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; CHECK-NEXT: [[TMP46:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP45]])
; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP46]], i64 3
; CHECK-NEXT: [[OP_RDX9:%.*]] = or <4 x i16> [[TMP47]], zeroinitializer
Expand Down
56 changes: 35 additions & 21 deletions llvm/test/Transforms/SLPVectorizer/operand-is-reduced-val.ll
Original file line number Diff line number Diff line change
@@ -1,27 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux < %s -slp-threshold=-10 | FileCheck %s %}
; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux < %s -slp-threshold=-10 | FileCheck %s %}
; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux < %s | FileCheck %s --check-prefix=X86 %}
; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux < %s | FileCheck %s --check-prefix=AARCH64 %}

define i64 @src(i32 %a) {
; CHECK-LABEL: define i64 @src(
; CHECK-SAME: i32 [[A:%.*]]) {
; CHECK-NEXT: [[ENTRY:.*:]]
; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[A]] to i64
; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i64> [[TMP3]], splat (i64 4294967297)
; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i64> [[TMP4]], splat (i64 1)
; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]])
; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP4]])
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP16]], i32 0
; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> [[TMP8]], i64 [[TMP18]], i32 1
; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> <i64 poison, i64 4294967297>, i64 [[TMP17]], i32 0
; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP9]], [[TMP10]]
; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP12]], [[TMP13]]
; CHECK-NEXT: ret i64 [[TMP21]]
; X86-LABEL: define i64 @src(
; X86-SAME: i32 [[A:%.*]]) {
; X86-NEXT: [[ENTRY:.*:]]
; X86-NEXT: [[TMP0:%.*]] = sext i32 [[A]] to i64
; X86-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
; X86-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
; X86-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
; X86-NEXT: [[TMP4:%.*]] = add nsw <4 x i64> [[TMP3]], splat (i64 4294967297)
; X86-NEXT: [[TMP5:%.*]] = and <4 x i64> [[TMP4]], splat (i64 1)
; X86-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; X86-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; X86-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
; X86-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]])
; X86-NEXT: [[OP_RDX:%.*]] = add i64 [[TMP9]], 4294967297
; X86-NEXT: [[OP_RDX1:%.*]] = add i64 [[OP_RDX]], [[TMP0]]
; X86-NEXT: ret i64 [[OP_RDX1]]
;
; AARCH64-LABEL: define i64 @src(
; AARCH64-SAME: i32 [[A:%.*]]) {
; AARCH64-NEXT: [[ENTRY:.*:]]
; AARCH64-NEXT: [[TMP0:%.*]] = sext i32 [[A]] to i64
; AARCH64-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
; AARCH64-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
; AARCH64-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
; AARCH64-NEXT: [[TMP4:%.*]] = add nsw <4 x i64> [[TMP3]], splat (i64 4294967297)
; AARCH64-NEXT: [[TMP5:%.*]] = and <4 x i64> [[TMP4]], splat (i64 1)
; AARCH64-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; AARCH64-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
; AARCH64-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; AARCH64-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]])
; AARCH64-NEXT: [[OP_RDX:%.*]] = add i64 [[TMP9]], 4294967297
; AARCH64-NEXT: [[OP_RDX1:%.*]] = add i64 [[OP_RDX]], [[TMP0]]
; AARCH64-NEXT: ret i64 [[OP_RDX1]]
;
entry:
%0 = sext i32 %a to i64
Expand Down