Skip to content

Commit 76e9f47

Browse files
committed
[LV] Check all users of partial reductions in chain have same scale.
Check that all partial reductions in a chain are only used by other partial reductions with the same scale factor. Otherwise we end up creating users of scaled reductions where the types of the other operands don't match. A similar issue was addressed in llvm#158603, but that fix missed the chained cases. Fixes llvm#162530.
1 parent aa406aa commit 76e9f47

File tree

3 files changed

+140
-134
lines changed

3 files changed

+140
-134
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -7910,6 +7910,29 @@ void VPRecipeBuilder::collectScaledReductions(VFRange &Range) {
79107910
(!Chain.ExtendB || ExtendIsOnlyUsedByPartialReductions(Chain.ExtendB)))
79117911
ScaledReductionMap.try_emplace(Chain.Reduction, Pair.second);
79127912
}
7913+
7914+
// Check that all partial reductions in a chain are only used by other partial
7915+
// reductions with the same scale factor. Otherwise we end up creating users
7916+
// of scaled reductions where the types of the other operands don't match.
7917+
auto AllUsersPartialRdx = [this](Instruction *I, unsigned Scale) {
7918+
return all_of(I->users(), [Scale, this](const User *U) {
7919+
auto *UI = cast<Instruction>(U);
7920+
7921+
if (isa<PHINode>(UI) && UI->getParent() == OrigLoop->getHeader()) {
7922+
return all_of(UI->users(), [Scale, this](const User *U) {
7923+
auto *UI = cast<Instruction>(U);
7924+
return ScaledReductionMap.lookup_or(UI, 0) == Scale;
7925+
});
7926+
}
7927+
7928+
return ScaledReductionMap.lookup_or(UI, 0) == Scale ||
7929+
!OrigLoop->contains(UI->getParent());
7930+
});
7931+
};
7932+
for (const auto &[Chain, Scale] : PartialReductionChains) {
7933+
if (!AllUsersPartialRdx(Chain.Reduction, Scale))
7934+
ScaledReductionMap.erase(Chain.Reduction);
7935+
}
79137936
}
79147937

79157938
bool VPRecipeBuilder::getScaledReductions(
@@ -8093,11 +8116,8 @@ VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(VPSingleDefRecipe *R,
80938116
if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
80948117
return tryToWidenMemory(Instr, Operands, Range);
80958118

8096-
if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr)) {
8097-
if (auto PartialRed =
8098-
tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value()))
8099-
return PartialRed;
8100-
}
8119+
if (std::optional<unsigned> ScaleFactor = getScalingForReduction(Instr))
8120+
return tryToCreatePartialReduction(Instr, Operands, ScaleFactor.value());
81018121

81028122
if (!shouldWiden(Instr, Range))
81038123
return nullptr;
@@ -8131,9 +8151,9 @@ VPRecipeBuilder::tryToCreatePartialReduction(Instruction *Reduction,
81318151
isa<VPPartialReductionRecipe>(BinOpRecipe))
81328152
std::swap(BinOp, Accumulator);
81338153

8134-
if (ScaleFactor !=
8135-
vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()))
8136-
return nullptr;
8154+
assert(ScaleFactor ==
8155+
vputils::getVFScaleFactor(Accumulator->getDefiningRecipe()) &&
8156+
"all accumulators in chain must have same scale factor");
81378157

81388158
unsigned ReductionOpcode = Reduction->getOpcode();
81398159
if (ReductionOpcode == Instruction::Sub) {

llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll

Lines changed: 0 additions & 126 deletions
Original file line numberDiff line numberDiff line change
@@ -1361,132 +1361,6 @@ for.body: ; preds = %for.body.preheader,
13611361
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
13621362
}
13631363

1364-
define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) {
1365-
; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain(
1366-
; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
1367-
; CHECK-NEON-NEXT: entry:
1368-
; CHECK-NEON-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
1369-
; CHECK-NEON-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
1370-
; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1
1371-
; CHECK-NEON-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
1372-
; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
1373-
; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1374-
; CHECK-NEON: vector.ph:
1375-
; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16
1376-
; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
1377-
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
1378-
; CHECK-NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[OFFSET]], i64 0
1379-
; CHECK-NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
1380-
; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
1381-
; CHECK-NEON: vector.body:
1382-
; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1383-
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
1384-
; CHECK-NEON-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
1385-
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
1386-
; CHECK-NEON-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
1387-
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
1388-
; CHECK-NEON-NEXT: [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
1389-
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
1390-
; CHECK-NEON-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1391-
; CHECK-NEON-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
1392-
; CHECK-NEON: middle.block:
1393-
; CHECK-NEON-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]])
1394-
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
1395-
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
1396-
; CHECK-NEON: scalar.ph:
1397-
;
1398-
; CHECK-SVE-LABEL: define i32 @red_extended_add_incomplete_chain(
1399-
; CHECK-SVE-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
1400-
; CHECK-SVE-NEXT: entry:
1401-
; CHECK-SVE-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
1402-
; CHECK-SVE-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
1403-
; CHECK-SVE-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1
1404-
; CHECK-SVE-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
1405-
; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
1406-
; CHECK-SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
1407-
; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
1408-
; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1409-
; CHECK-SVE: vector.ph:
1410-
; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
1411-
; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
1412-
; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]]
1413-
; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
1414-
; CHECK-SVE-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
1415-
; CHECK-SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[OFFSET]], i64 0
1416-
; CHECK-SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
1417-
; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
1418-
; CHECK-SVE: vector.body:
1419-
; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1420-
; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
1421-
; CHECK-SVE-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
1422-
; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP]], align 1
1423-
; CHECK-SVE-NEXT: [[TMP7:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
1424-
; CHECK-SVE-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP7]]
1425-
; CHECK-SVE-NEXT: [[TMP9]] = add <vscale x 4 x i32> [[TMP8]], [[BROADCAST_SPLAT]]
1426-
; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
1427-
; CHECK-SVE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1428-
; CHECK-SVE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
1429-
; CHECK-SVE: middle.block:
1430-
; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP9]])
1431-
; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
1432-
; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
1433-
; CHECK-SVE: scalar.ph:
1434-
;
1435-
; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_incomplete_chain(
1436-
; CHECK-SVE-MAXBW-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
1437-
; CHECK-SVE-MAXBW-NEXT: entry:
1438-
; CHECK-SVE-MAXBW-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
1439-
; CHECK-SVE-MAXBW-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
1440-
; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1
1441-
; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
1442-
; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
1443-
; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 3
1444-
; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
1445-
; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
1446-
; CHECK-SVE-MAXBW: vector.ph:
1447-
; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
1448-
; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
1449-
; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]]
1450-
; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
1451-
; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
1452-
; CHECK-SVE-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[OFFSET]], i64 0
1453-
; CHECK-SVE-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
1454-
; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
1455-
; CHECK-SVE-MAXBW: vector.body:
1456-
; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
1457-
; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
1458-
; CHECK-SVE-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
1459-
; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1
1460-
; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
1461-
; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = add <vscale x 8 x i32> [[VEC_PHI]], [[TMP7]]
1462-
; CHECK-SVE-MAXBW-NEXT: [[TMP8]] = add <vscale x 8 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
1463-
; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
1464-
; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
1465-
; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
1466-
; CHECK-SVE-MAXBW: middle.block:
1467-
; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP8]])
1468-
; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
1469-
; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
1470-
; CHECK-SVE-MAXBW: scalar.ph:
1471-
;
1472-
entry:
1473-
br label %loop
1474-
1475-
loop:
1476-
%ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
1477-
%red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
1478-
%l = load i8, ptr %ptr.iv, align 1
1479-
%l.ext = zext i8 %l to i32
1480-
%add = add i32 %red, %l.ext
1481-
%red.next = add i32 %add, %offset
1482-
%gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
1483-
%ec = icmp eq ptr %ptr.iv, %end
1484-
br i1 %ec, label %exit, label %loop
1485-
1486-
exit:
1487-
ret i32 %red.next
1488-
}
1489-
14901364
attributes #0 = { vscale_range(1,16) }
14911365

14921366

Lines changed: 112 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,112 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --filter-out-after "^scalar.ph:" --version 4
2+
; RUN: opt --mattr=+neon,+dotprod -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S < %s | FileCheck %s --check-prefixes=CHECK-NEON
3+
4+
target triple = "arm64-apple-macosx"
5+
6+
define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) {
7+
; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain(
8+
; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
9+
; CHECK-NEON-NEXT: entry:
10+
; CHECK-NEON-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
11+
; CHECK-NEON-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
12+
; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1
13+
; CHECK-NEON-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
14+
; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
15+
; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
16+
; CHECK-NEON: vector.ph:
17+
; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16
18+
; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
19+
; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
20+
; CHECK-NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[OFFSET]], i64 0
21+
; CHECK-NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
22+
; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
23+
; CHECK-NEON: vector.body:
24+
; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
25+
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
26+
; CHECK-NEON-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
27+
; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
28+
; CHECK-NEON-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
29+
; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
30+
; CHECK-NEON-NEXT: [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
31+
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
32+
; CHECK-NEON-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
33+
; CHECK-NEON-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
34+
; CHECK-NEON: middle.block:
35+
; CHECK-NEON-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]])
36+
; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
37+
; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
38+
; CHECK-NEON: scalar.ph:
39+
;
40+
entry:
41+
br label %loop
42+
43+
loop:
44+
%ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
45+
%red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
46+
%l = load i8, ptr %ptr.iv, align 1
47+
%l.ext = zext i8 %l to i32
48+
%add = add i32 %red, %l.ext
49+
%red.next = add i32 %add, %offset
50+
%gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
51+
%ec = icmp eq ptr %ptr.iv, %end
52+
br i1 %ec, label %exit, label %loop
53+
54+
exit:
55+
ret i32 %red.next
56+
}
57+
58+
59+
define i16 @test_incomplete_chain_without_mul(ptr noalias %dst, ptr %A, ptr %B) #0 {
60+
; CHECK-NEON-LABEL: define i16 @test_incomplete_chain_without_mul(
61+
; CHECK-NEON-SAME: ptr noalias [[DST:%.*]], ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] {
62+
; CHECK-NEON-NEXT: entry:
63+
; CHECK-NEON-NEXT: br label [[VECTOR_MEMCHECK:%.*]]
64+
; CHECK-NEON: vector.ph:
65+
; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
66+
; CHECK-NEON: vector.body:
67+
; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_MEMCHECK]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
68+
; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i16> [ zeroinitializer, [[VECTOR_MEMCHECK]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ]
69+
; CHECK-NEON-NEXT: [[TMP0:%.*]] = load i8, ptr [[A]], align 1
70+
; CHECK-NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i8> poison, i8 [[TMP0]], i64 0
71+
; CHECK-NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT]], <16 x i8> poison, <16 x i32> zeroinitializer
72+
; CHECK-NEON-NEXT: [[TMP1:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT]] to <16 x i16>
73+
; CHECK-NEON-NEXT: [[TMP2:%.*]] = extractelement <16 x i16> [[TMP1]], i32 15
74+
; CHECK-NEON-NEXT: store i16 [[TMP2]], ptr [[DST]], align 2
75+
; CHECK-NEON-NEXT: [[TMP3:%.*]] = load i8, ptr [[B]], align 1
76+
; CHECK-NEON-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement <16 x i8> poison, i8 [[TMP3]], i64 0
77+
; CHECK-NEON-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector <16 x i8> [[BROADCAST_SPLATINSERT6]], <16 x i8> poison, <16 x i32> zeroinitializer
78+
; CHECK-NEON-NEXT: [[TMP4:%.*]] = zext <16 x i8> [[BROADCAST_SPLAT7]] to <16 x i16>
79+
; CHECK-NEON-NEXT: [[TMP5:%.*]] = add <16 x i16> [[VEC_PHI]], [[TMP4]]
80+
; CHECK-NEON-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP5]], [[TMP1]]
81+
; CHECK-NEON-NEXT: [[TMP7]] = add <16 x i16> [[TMP6]], [[TMP4]]
82+
; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
83+
; CHECK-NEON-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
84+
; CHECK-NEON-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
85+
; CHECK-NEON: middle.block:
86+
; CHECK-NEON-NEXT: [[TMP9:%.*]] = call i16 @llvm.vector.reduce.add.v16i16(<16 x i16> [[TMP7]])
87+
; CHECK-NEON-NEXT: br label [[SCALAR_PH:%.*]]
88+
; CHECK-NEON: scalar.ph:
89+
;
90+
entry:
91+
br label %loop
92+
93+
loop:
94+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
95+
%red = phi i16 [ 0, %entry ], [ %red.next, %loop ]
96+
%l.a = load i8, ptr %A, align 1
97+
%a.ext = zext i8 %l.a to i16
98+
store i16 %a.ext, ptr %dst, align 2
99+
%l.b = load i8, ptr %B, align 1
100+
%b.ext = zext i8 %l.b to i16
101+
%add = add i16 %red, %b.ext
102+
%add.1 = add i16 %add, %a.ext
103+
%red.next = add i16 %add.1, %b.ext
104+
%iv.next = add i64 %iv, 1
105+
%ec = icmp ult i64 %iv, 1024
106+
br i1 %ec, label %loop, label %exit
107+
108+
exit:
109+
ret i16 %red.next
110+
}
111+
112+
attributes #0 = { "target-cpu"="grace" }

0 commit comments

Comments
 (0)