-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[LV] Check all users of partial reductions in chain have same scale. #162822
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
Check that all partial reductions in a chain are only used by other partial reductions with the same scale factor. Otherwise we end up creating users of scaled reductions where the types of the other operands don't match. A similar issue was addressed in llvm#158603, but misses the chained cases. Fixes llvm#162530.
@llvm/pr-subscribers-vectorizers @llvm/pr-subscribers-llvm-transforms Author: Florian Hahn (fhahn) ChangesCheck that all partial reductions in a chain are only used by other partial reductions with the same scale factor. Otherwise we end up creating users of scaled reductions where the types of the other operands don't match. A similar issue was addressed in #158603, but misses the chained cases. Fixes #162530. Full diff: https://github.com/llvm/llvm-project/pull/162822.diff 3 Files Affected:
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index e62d57e6920b7..8856857e06964 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7910,6 +7910,29 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
(!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)))
ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second);
}
+
+ // Check that all partial reductions in a chain are only used by other partial
+ // reductions with the same scale factor. Otherwise we end up creating users
+ // of scaled reductions where the types of the other operands don't match.
+ auto AllUsersPartialRdx = [this](Instruction *I, unsigned Scale) {
+ return all_of(I->users(), [Scale, this](const User *U) {
+ auto *UI = cast<Instruction>(U);
+
+ if (isa<PHINode>(UI) && UI->getParent() == OrigLoop->getHeader()) {
+ return all_of(UI->users(), [Scale, this](const User *U) {
+ auto *UI = cast<Instruction>(U);
+ return ScaledReductionMap.lookup_or(UI, 0) == Scale;
+ });
+ }
+
+ return ScaledReductionMap.lookup_or(UI, 0) == Scale ||
+ !OrigLoop->contains(UI->getParent());
+ });
+ };
+ for (const auto &[Chain, Scale] : PartialReductionChains) {
+ if (!AllUsersPartialRdx(Chain.Reduction, Scale))
+ ScaledReductionMap.erase(Chain.Reduction);
+ }
}
bool VPRecipeBuilder::getScaledReductions(
@@ -8093,11 +8116,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
return tryToWidenMemory(Instr, Operands, Range);
- if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) {
- if (auto PartialRed =
- tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()))
- return PartialRed;
- }
+ if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
+ return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
if (!shouldWiden(Instr, Range))
return nullptr;
@@ -8131,9 +8151,9 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
isa<VPPartialReductionRecipe>(BinOpRecipe))
std::swap(BinOp, Accumulator);
- if (ScaleFactor !=
- vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()))
- return nullptr;
+ assert(ScaleFactor ==
+ vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()) &&
+ "all accumulators in chain must have same scale factor");
unsigned ReductionOpcode = Reduction->getOpcode();
if (ReductionOpcode == Instruction::Sub) {
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index 5ae08393a1804..3dfa6df3313a5 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -1361,132 +1361,6 @@ for.body: ; preds = %for.body.preheader,
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
}
-define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) {
-; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain(
-; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEON-NEXT: entry:
-; CHECK-NEON-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
-; CHECK-NEON-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
-; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1
-; CHECK-NEON-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
-; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
-; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-NEON: vector.ph:
-; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16
-; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
-; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
-; CHECK-NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[OFFSET]], i64 0
-; CHECK-NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-NEON: vector.body:
-; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEON-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
-; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-NEON-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
-; CHECK-NEON-NEXT: [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
-; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEON-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEON-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK-NEON: middle.block:
-; CHECK-NEON-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]])
-; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
-; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-NEON: scalar.ph:
-;
-; CHECK-SVE-LABEL: define i32 @red_extended_add_incomplete_chain(
-; CHECK-SVE-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-SVE-NEXT: entry:
-; CHECK-SVE-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
-; CHECK-SVE-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
-; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]]
-; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
-; CHECK-SVE-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
-; CHECK-SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[OFFSET]], i64 0
-; CHECK-SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-SVE: vector.body:
-; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-SVE-NEXT: [[TMP7:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP7]]
-; CHECK-SVE-NEXT: [[TMP9]] = add <vscale x 4 x i32> [[TMP8]], [[BROADCAST_SPLAT]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-SVE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-SVE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP9]])
-; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
-; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE: scalar.ph:
-;
-; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_incomplete_chain(
-; CHECK-SVE-MAXBW-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-SVE-MAXBW-NEXT: entry:
-; CHECK-SVE-MAXBW-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
-; CHECK-SVE-MAXBW-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
-; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 3
-; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
-; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-SVE-MAXBW: vector.ph:
-; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
-; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]]
-; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
-; CHECK-SVE-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[OFFSET]], i64 0
-; CHECK-SVE-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-SVE-MAXBW: vector.body:
-; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = add <vscale x 8 x i32> [[VEC_PHI]], [[TMP7]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP8]] = add <vscale x 8 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
-; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK-SVE-MAXBW: middle.block:
-; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP8]])
-; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
-; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE-MAXBW: scalar.ph:
-;
-entry:
- br label %loop
-
-loop:
- %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
- %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
- %l = load i8, ptr %ptr.iv, align 1
- %l.ext = zext i8 %l to i32
- %add = add i32 %red, %l.ext
- %red.next = add i32 %add, %offset
- %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
- %ec = icmp eq ptr %ptr.iv, %end
- br i1 %ec, label %exit, label %loop
-
-exit:
- ret i32 %red.next
-}
-
attributes #0 = { vscale_range(1,16) }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll
new file mode 100644
index 0000000000000..5b5c40a83e11e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 4
+; RUN: opt --mattr=+neon,+dotprod -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-NEON
+
+target triple = "arm64-apple-macosx"
+
+define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) {
+; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain(
+; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
+; CHECK-NEON-NEXT: entry:
+; CHECK-NEON-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; CHECK-NEON-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1
+; CHECK-NEON-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
+; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; CHECK-NEON: vector.ph:
+; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16
+; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
+; CHECK-NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[OFFSET]], i64 0
+; CHECK-NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEON: vector.body:
+; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
+; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-NEON-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
+; CHECK-NEON-NEXT: [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
+; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEON-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEON-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
+; CHECK-NEON: middle.block:
+; CHECK-NEON-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]])
+; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
+; CHECK-NEON: scalar.ph:
+;
+entry:
+ br label %loop
+
+loop:
+ %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
+ %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
+ %l = load i8, ptr %ptr.iv, align 1
+ %l.ext = zext i8 %l to i32
+ %add = add i32 %red, %l.ext
+ %red.next = add i32 %add, %offset
+ %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+ %ec = icmp eq ptr %ptr.iv, %end
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret i32 %red.next
+}
+
+
+define i16 @test_incomplete_chain_without_mul(ptr noalias %dst, ptr %A, ptr %B) #0 {
+; CHECK-NEON-LABEL: define i16 @test_incomplete_chain_without_mul(
+; CHECK-NEON-SAME: ptr noalias [[DST:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEON-NEXT: entry:
+; CHECK-NEON-NEXT: br label [[VECTOR_MEMCHECK:%.*]]
+; CHECK-NEON: vector.ph:
+; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
+; CHECK-NEON: vector.body:
+; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i16> [ zeroinitializer, [[VECTOR_MEMCHECK]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
+; CHECK-NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i64 0
+; CHECK-NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEON-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i16>
+; CHECK-NEON-NEXT: [[TMP2:%.*]] = extractelement <16 x i16> [[TMP1]], i32 15
+; CHECK-NEON-NEXT: store i16 [[TMP2]], ptr [[DST]], align 2
+; CHECK-NEON-NEXT: [[TMP3:%.*]] = load i8, ptr [[B]], align 1
+; CHECK-NEON-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <16 x i8> poison, i8 [[TMP3]], i64 0
+; CHECK-NEON-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT6]], <16 x i8> poison, <16 x i32> zeroinitializer
+; CHECK-NEON-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT7]] to <16 x i16>
+; CHECK-NEON-NEXT: [[TMP5:%.*]] = add <16 x i16> [[VEC_PHI]], [[TMP4]]
+; CHECK-NEON-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP5]], [[TMP1]]
+; CHECK-NEON-NEXT: [[TMP7]] = add <16 x i16> [[TMP6]], [[TMP4]]
+; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEON-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
+; CHECK-NEON-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEON: middle.block:
+; CHECK-NEON-NEXT: [[TMP9:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP7]])
+; CHECK-NEON-NEXT: br label [[SCALAR_PH:%.*]]
+; CHECK-NEON: scalar.ph:
+;
+entry:
+ br label %loop
+
+loop:
+ %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+ %red = phi i16 [ 0, %entry ], [ %red.next, %loop ]
+ %l.a = load i8, ptr %A, align 1
+ %a.ext = zext i8 %l.a to i16
+ store i16 %a.ext, ptr %dst, align 2
+ %l.b = load i8, ptr %B, align 1
+ %b.ext = zext i8 %l.b to i16
+ %add = add i16 %red, %b.ext
+ %add.1 = add i16 %add, %a.ext
+ %red.next = add i16 %add.1, %b.ext
+ %iv.next = add i64 %iv, 1
+ %ec = icmp ult i64 %iv, 1024
+ br i1 %ec, label %loop, label %exit
+
+exit:
+ ret i16 %red.next
+}
+
+attributes #0 = { "target-cpu"="grace" }
|
Check that all partial reductions in a chain are only used by other partial reductions with the same scale factor. Otherwise we end up creating users of scaled reductions where the types of the other operands don't match.
A similar issue was addressed in #158603, but misses the chained cases.
Fixes #162530.