-
Notifications
You must be signed in to change notification settings - Fork 14.9k
[SLP] Support for copyables in the reduced values #153589
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[SLP] Support for copyables in the reduced values #153589
Conversation
Created using spr 1.3.5
@llvm/pr-subscribers-backend-risc-v @llvm/pr-subscribers-llvm-transforms Author: Alexey Bataev (alexey-bataev) ChangesCurrently reductions can handles only same/alternate instructions, Full diff: https://github.com/llvm/llvm-project/pull/153589.diff 7 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index a3cb4d138789c..3e64917bd7739 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -23783,7 +23783,8 @@ class HorizontalReduction {
/// Attempt to vectorize the tree found by matchAssociativeReduction.
Value *tryToReduce(BoUpSLP &V, const DataLayout &DL, TargetTransformInfo *TTI,
- const TargetLibraryInfo &TLI, AssumptionCache *AC) {
+ const TargetLibraryInfo &TLI, AssumptionCache *AC,
+ DominatorTree &DT) {
constexpr unsigned RegMaxNumber = 4;
constexpr unsigned RedValsMaxNumber = 128;
// If there are a sufficient number of reduction values, reduce
@@ -23900,8 +23901,44 @@ class HorizontalReduction {
bool CheckForReusedReductionOps = false;
// Try to vectorize elements based on their type.
SmallVector<InstructionsState> States;
- for (ArrayRef<Value *> RV : ReducedVals)
+ SmallVector<SmallVector<Value*>> LocalReducedVals;
+ for (ArrayRef<Value *> RV : ReducedVals) {
+ // Loads are not very compatible with undefs.
+ if (isa<UndefValue>(RV.front()) &&
+ (States.empty() || !States.back() ||
+ States.back().getOpcode() == Instruction::Load)) {
+ LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
+ States.push_back(InstructionsState::invalid());
+ continue;
+ }
+ if (!LocalReducedVals.empty() &&
+ isa<UndefValue>(LocalReducedVals.back().front()) &&
+ isa<LoadInst>(RV.front())) {
+ LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
+ States.push_back(getSameOpcode(RV, TLI));
+ continue;
+ }
+ SmallVector<Value *> Ops;
+ if (!LocalReducedVals.empty())
+ Ops = LocalReducedVals.back();
+ Ops.append(RV.begin(), RV.end());
+ InstructionsCompatibilityAnalysis Analysis(DT, DL, *TTI, TLI);
+ InstructionsState OpS =
+ Analysis.buildInstructionsState(Ops, V, VectorizeCopyableElements);
+ if (LocalReducedVals.empty()) {
+ LocalReducedVals.push_back(Ops);
+ States.push_back(OpS);
+ continue;
+ }
+ if (OpS) {
+ LocalReducedVals.back().swap(Ops);
+ States.back() = OpS;
+ continue;
+ }
+ LocalReducedVals.emplace_back().append(RV.begin(), RV.end());
States.push_back(getSameOpcode(RV, TLI));
+ }
+ ReducedVals.swap(LocalReducedVals);
for (unsigned I = 0, E = ReducedVals.size(); I < E; ++I) {
ArrayRef<Value *> OrigReducedVals = ReducedVals[I];
InstructionsState S = States[I];
@@ -23916,8 +23953,10 @@ class HorizontalReduction {
// Also check if the instruction was folded to constant/other value.
auto *Inst = dyn_cast<Instruction>(RdxVal);
if ((Inst && isVectorLikeInstWithConstOps(Inst) &&
- (!S || !S.getMatchingMainOpOrAltOp(Inst))) ||
- (S && !Inst))
+ (!S || (!S.getMatchingMainOpOrAltOp(Inst) &&
+ !S.isCopyableElement(Inst)))) ||
+ (S && !Inst && !isa<PoisonValue>(RdxVal) &&
+ !S.isCopyableElement(RdxVal)))
continue;
Candidates.push_back(RdxVal);
TrackedToOrig.try_emplace(RdxVal, ReducedVal);
@@ -24503,6 +24542,8 @@ class HorizontalReduction {
// Scalar cost is repeated for N-1 elements.
int Cnt = ReducedVals.size();
for (Value *RdxVal : ReducedVals) {
+ if (!isa<Instruction>(RdxVal))
+ continue;
if (Cnt == 1)
break;
--Cnt;
@@ -25247,7 +25288,7 @@ bool SLPVectorizerPass::vectorizeHorReduction(
HorizontalReduction HorRdx;
if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
return nullptr;
- return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
+ return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC, *DT);
};
auto TryAppendToPostponedInsts = [&](Instruction *FutureSeed) {
if (TryOperandsAsNewSeeds && FutureSeed == Root) {
@@ -25392,7 +25433,7 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
if (RedCost >= ScalarCost)
return false;
- return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC) != nullptr;
+ return HorRdx.tryToReduce(R, *DL, &TTI, *TLI, AC, *DT) != nullptr;
};
if (Candidates.size() == 1)
return TryToReduce(I, {Op0, Op1}) || tryToVectorizeList({Op0, Op1}, R);
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
index 143e09374a891..e3279290c3074 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/PR39774.ll
@@ -5,34 +5,38 @@
define void @Test(i32) {
; CHECK-LABEL: @Test(
; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0
; CHECK-NEXT: br label [[LOOP:%.*]]
; CHECK: loop:
; CHECK-NEXT: [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
+; CHECK-NEXT: [[LOCAL_8_43_US1:%.*]] = phi i32 [ [[VAL_44:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[LOCAL_8_43_US]], i32 1
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOCAL_8_43_US1]], i32 2
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP6]], <i32 0, i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529>
+; CHECK-NEXT: [[VAL_41:%.*]] = add i32 [[LOCAL_8_43_US1]], 13685
; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
-; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US]]
-; CHECK-NEXT: [[VAL_43]] = add i32 [[LOCAL_8_43_US]], 14910
-; CHECK-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0
-; CHECK-NEXT: [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_43]], i32 1
+; CHECK-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP4]], [[VAL_41]]
+; CHECK-NEXT: [[VAL_43]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US1]]
+; CHECK-NEXT: [[VAL_44]] = add i32 [[LOCAL_8_43_US1]], 14910
; CHECK-NEXT: br label [[LOOP]]
;
; FORCE_REDUCTION-LABEL: @Test(
; FORCE_REDUCTION-NEXT: entry:
+; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = insertelement <8 x i32> poison, i32 [[TMP0:%.*]], i32 0
; FORCE_REDUCTION-NEXT: br label [[LOOP:%.*]]
; FORCE_REDUCTION: loop:
; FORCE_REDUCTION-NEXT: [[LOCAL_8_43_US:%.*]] = phi i32 [ [[VAL_43:%.*]], [[LOOP]] ], [ 0, [[ENTRY:%.*]] ]
-; FORCE_REDUCTION-NEXT: [[TMP1:%.*]] = phi <2 x i32> [ [[TMP6:%.*]], [[LOOP]] ], [ zeroinitializer, [[ENTRY]] ]
-; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = shufflevector <2 x i32> [[TMP1]], <2 x i32> poison, <8 x i32> <i32 0, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
-; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], <i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529, i32 13685>
+; FORCE_REDUCTION-NEXT: [[LOCAL_8_43_US1:%.*]] = phi i32 [ [[VAL_44:%.*]], [[LOOP]] ], [ 0, [[ENTRY]] ]
+; FORCE_REDUCTION-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> [[TMP1]], i32 [[LOCAL_8_43_US]], i32 1
+; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[LOCAL_8_43_US1]], i32 2
+; FORCE_REDUCTION-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[TMP5]], <8 x i32> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+; FORCE_REDUCTION-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP6]], <i32 0, i32 0, i32 55, i32 285, i32 1240, i32 1496, i32 8555, i32 12529>
+; FORCE_REDUCTION-NEXT: [[VAL_41:%.*]] = add i32 [[LOCAL_8_43_US1]], 13685
; FORCE_REDUCTION-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.and.v8i32(<8 x i32> [[TMP3]])
-; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP0:%.*]], [[TMP4]]
-; FORCE_REDUCTION-NEXT: [[OP_RDX1:%.*]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US]]
-; FORCE_REDUCTION-NEXT: [[VAL_43]] = add i32 [[LOCAL_8_43_US]], 14910
-; FORCE_REDUCTION-NEXT: [[TMP5:%.*]] = insertelement <2 x i32> poison, i32 [[OP_RDX1]], i32 0
-; FORCE_REDUCTION-NEXT: [[TMP6]] = insertelement <2 x i32> [[TMP5]], i32 [[VAL_43]], i32 1
+; FORCE_REDUCTION-NEXT: [[OP_RDX:%.*]] = and i32 [[TMP4]], [[VAL_41]]
+; FORCE_REDUCTION-NEXT: [[VAL_43]] = and i32 [[OP_RDX]], [[LOCAL_8_43_US1]]
+; FORCE_REDUCTION-NEXT: [[VAL_44]] = add i32 [[LOCAL_8_43_US1]], 14910
; FORCE_REDUCTION-NEXT: br label [[LOOP]]
;
entry:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll
index 79c6c6b3f046f..d7c63457bf5c1 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/multi-tracked-reduced-value.ll
@@ -8,11 +8,11 @@ define i8 @test() {
; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 0 to i8
; CHECK-NEXT: [[TMP2:%.*]] = trunc i32 0 to i8
; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 0 to i8
+; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> zeroinitializer)
; CHECK-NEXT: [[OP_RDX1:%.*]] = or i8 [[TMP0]], [[TMP2]]
; CHECK-NEXT: [[OP_RDX2:%.*]] = or i8 [[OP_RDX1]], [[TMP0]]
; CHECK-NEXT: [[OP_RDX3:%.*]] = or i8 [[OP_RDX2]], [[TMP1]]
; CHECK-NEXT: [[OP_RDX5:%.*]] = or i8 [[OP_RDX3]], [[TMP3]]
-; CHECK-NEXT: [[TMP4:%.*]] = call i8 @llvm.vector.reduce.or.v8i8(<8 x i8> zeroinitializer)
; CHECK-NEXT: [[OP_RDX4:%.*]] = or i8 [[OP_RDX5]], [[TMP4]]
; CHECK-NEXT: ret i8 [[OP_RDX4]]
;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll
index 1cf837df719ec..56919ae0ffc90 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-extracted-value.ll
@@ -10,9 +10,9 @@ define i32 @foo() {
; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP3]], zeroinitializer
; CHECK-NEXT: [[RDX_OP:%.*]] = mul <4 x i32> [[TMP4]], zeroinitializer
; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vector.reduce.mul.v4i32(<4 x i32> [[RDX_OP]])
+; CHECK-NEXT: [[OP_RDX7:%.*]] = mul i32 0, [[TMP2]]
; CHECK-NEXT: [[OP_RDX:%.*]] = mul i32 0, [[TMP5]]
-; CHECK-NEXT: [[OP_RDX1:%.*]] = mul i32 [[OP_RDX]], 0
-; CHECK-NEXT: [[OP_RDX6:%.*]] = mul i32 [[OP_RDX1]], [[TMP2]]
+; CHECK-NEXT: [[OP_RDX6:%.*]] = mul i32 [[OP_RDX7]], [[OP_RDX]]
; CHECK-NEXT: ret i32 [[OP_RDX6]]
;
bb:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll b/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll
index 6d6dd502415e5..dd3dee6910a4c 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/reduction-value-in-tree.ll
@@ -6,7 +6,7 @@ define void @test() {
; CHECK-NEXT: br i1 false, label [[PH:%.*]], label [[EXIT:%.*]]
; CHECK: ph:
; CHECK-NEXT: [[TMP0:%.*]] = call i8 @llvm.vector.reduce.and.v8i8(<8 x i8> zeroinitializer)
-; CHECK-NEXT: [[OP_RDX:%.*]] = and i8 0, [[TMP0]]
+; CHECK-NEXT: [[OP_RDX:%.*]] = and i8 [[TMP0]], 0
; CHECK-NEXT: [[OP_RDX1:%.*]] = and i8 [[OP_RDX]], 0
; CHECK-NEXT: br label [[EXIT]]
; CHECK: exit:
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
index f7811aba5ab5f..2774d5f3b64e4 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/revec-reduced-value-vectorized-later.ll
@@ -7,16 +7,17 @@ define <4 x i16> @test() {
; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <4 x i16> zeroinitializer, <4 x i16> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i16> [[TMP0]], zeroinitializer
; CHECK-NEXT: [[RDX_OP:%.*]] = or <16 x i16> zeroinitializer, [[TMP1]]
-; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
+; CHECK-NEXT: [[RDX_OP1:%.*]] = or <16 x i16> [[RDX_OP]], zeroinitializer
+; CHECK-NEXT: [[TMP36:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> <i32 0, i32 4, i32 8, i32 12>
; CHECK-NEXT: [[TMP37:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP36]])
; CHECK-NEXT: [[TMP38:%.*]] = insertelement <4 x i16> poison, i16 [[TMP37]], i64 0
-; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
+; CHECK-NEXT: [[TMP39:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> <i32 1, i32 5, i32 9, i32 13>
; CHECK-NEXT: [[TMP40:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP39]])
; CHECK-NEXT: [[TMP41:%.*]] = insertelement <4 x i16> [[TMP38]], i16 [[TMP40]], i64 1
-; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
+; CHECK-NEXT: [[TMP42:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> <i32 2, i32 6, i32 10, i32 14>
; CHECK-NEXT: [[TMP43:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP42]])
; CHECK-NEXT: [[TMP44:%.*]] = insertelement <4 x i16> [[TMP41]], i16 [[TMP43]], i64 2
-; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i16> [[RDX_OP]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
+; CHECK-NEXT: [[TMP45:%.*]] = shufflevector <16 x i16> [[RDX_OP1]], <16 x i16> poison, <4 x i32> <i32 3, i32 7, i32 11, i32 15>
; CHECK-NEXT: [[TMP46:%.*]] = call i16 @llvm.vector.reduce.or.v4i16(<4 x i16> [[TMP45]])
; CHECK-NEXT: [[TMP47:%.*]] = insertelement <4 x i16> [[TMP44]], i16 [[TMP46]], i64 3
; CHECK-NEXT: [[OP_RDX9:%.*]] = or <4 x i16> [[TMP47]], zeroinitializer
diff --git a/llvm/test/Transforms/SLPVectorizer/operand-is-reduced-val.ll b/llvm/test/Transforms/SLPVectorizer/operand-is-reduced-val.ll
index 62417268bf3d0..7ed1edc278806 100644
--- a/llvm/test/Transforms/SLPVectorizer/operand-is-reduced-val.ll
+++ b/llvm/test/Transforms/SLPVectorizer/operand-is-reduced-val.ll
@@ -1,27 +1,41 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
-; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux < %s -slp-threshold=-10 | FileCheck %s %}
-; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux < %s -slp-threshold=-10 | FileCheck %s %}
+; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux < %s | FileCheck %s --check-prefix=X86 %}
+; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux < %s | FileCheck %s --check-prefix=AARCH64 %}
define i64 @src(i32 %a) {
-; CHECK-LABEL: define i64 @src(
-; CHECK-SAME: i32 [[A:%.*]]) {
-; CHECK-NEXT: [[ENTRY:.*:]]
-; CHECK-NEXT: [[TMP17:%.*]] = sext i32 [[A]] to i64
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
-; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i64> [[TMP3]], splat (i64 4294967297)
-; CHECK-NEXT: [[TMP6:%.*]] = and <4 x i64> [[TMP4]], splat (i64 1)
-; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP6]])
-; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> [[TMP4]])
-; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x i64> poison, i64 [[TMP16]], i32 0
-; CHECK-NEXT: [[TMP9:%.*]] = insertelement <2 x i64> [[TMP8]], i64 [[TMP18]], i32 1
-; CHECK-NEXT: [[TMP10:%.*]] = insertelement <2 x i64> <i64 poison, i64 4294967297>, i64 [[TMP17]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = add <2 x i64> [[TMP9]], [[TMP10]]
-; CHECK-NEXT: [[TMP12:%.*]] = extractelement <2 x i64> [[TMP11]], i32 0
-; CHECK-NEXT: [[TMP13:%.*]] = extractelement <2 x i64> [[TMP11]], i32 1
-; CHECK-NEXT: [[TMP21:%.*]] = add i64 [[TMP12]], [[TMP13]]
-; CHECK-NEXT: ret i64 [[TMP21]]
+; X86-LABEL: define i64 @src(
+; X86-SAME: i32 [[A:%.*]]) {
+; X86-NEXT: [[ENTRY:.*:]]
+; X86-NEXT: [[TMP0:%.*]] = sext i32 [[A]] to i64
+; X86-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
+; X86-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; X86-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; X86-NEXT: [[TMP4:%.*]] = add nsw <4 x i64> [[TMP3]], splat (i64 4294967297)
+; X86-NEXT: [[TMP5:%.*]] = and <4 x i64> [[TMP4]], splat (i64 1)
+; X86-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; X86-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; X86-NEXT: [[TMP8:%.*]] = shufflevector <8 x i64> [[TMP6]], <8 x i64> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; X86-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]])
+; X86-NEXT: [[OP_RDX:%.*]] = add i64 [[TMP9]], 4294967297
+; X86-NEXT: [[OP_RDX1:%.*]] = add i64 [[OP_RDX]], [[TMP0]]
+; X86-NEXT: ret i64 [[OP_RDX1]]
+;
+; AARCH64-LABEL: define i64 @src(
+; AARCH64-SAME: i32 [[A:%.*]]) {
+; AARCH64-NEXT: [[ENTRY:.*:]]
+; AARCH64-NEXT: [[TMP0:%.*]] = sext i32 [[A]] to i64
+; AARCH64-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A]], i32 0
+; AARCH64-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
+; AARCH64-NEXT: [[TMP3:%.*]] = sext <4 x i32> [[TMP2]] to <4 x i64>
+; AARCH64-NEXT: [[TMP4:%.*]] = add nsw <4 x i64> [[TMP3]], splat (i64 4294967297)
+; AARCH64-NEXT: [[TMP5:%.*]] = and <4 x i64> [[TMP4]], splat (i64 1)
+; AARCH64-NEXT: [[TMP6:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AARCH64-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP4]], <4 x i64> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 poison, i32 poison, i32 poison>
+; AARCH64-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[TMP5]], <4 x i64> [[TMP4]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AARCH64-NEXT: [[TMP9:%.*]] = call i64 @llvm.vector.reduce.add.v8i64(<8 x i64> [[TMP8]])
+; AARCH64-NEXT: [[OP_RDX:%.*]] = add i64 [[TMP9]], 4294967297
+; AARCH64-NEXT: [[OP_RDX1:%.*]] = add i64 [[OP_RDX]], [[TMP0]]
+; AARCH64-NEXT: ret i64 [[OP_RDX1]]
;
entry:
%0 = sext i32 %a to i64
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Created using spr 1.3.5
Ping! |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please can you update against trunk?
SmallVector<InstructionsState> States; | ||
for (ArrayRef<Value *> RV : ReducedVals) | ||
SmallVector<SmallVector<Value *>> LocalReducedVals; | ||
for (ArrayRef<Value *> RV : ReducedVals) { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Comments?
Created using spr 1.3.7
Currently reductions can handles only same/alternate instructions,
skipping potential support for copyables. Patch adds support for
copyables in the reduced values.