Skip to content

Commit 1accce4

Browse files
committed
[VPlan] Introduce CSE pass
1 parent 4cf7702 commit 1accce4

30 files changed

+195
-166
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7225,6 +7225,7 @@ DenseMap<const SCEV *, Value *> LoopVectorizationPlanner::executePlan(
72257225
VPlanTransforms::narrowInterleaveGroups(
72267226
BestVPlan, BestVF,
72277227
TTI.getRegisterBitWidth(TargetTransformInfo::RGK_FixedWidthVector));
7228+
VPlanTransforms::cse(BestVPlan);
72287229
VPlanTransforms::removeDeadRecipes(BestVPlan);
72297230

72307231
VPlanTransforms::convertToConcreteRecipes(BestVPlan);

llvm/lib/Transforms/Vectorize/VPlan.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -904,6 +904,11 @@ struct VPRecipeWithIRFlags : public VPSingleDefRecipe, public VPIRFlags {
904904
return R && classof(R);
905905
}
906906

907+
static inline bool classof(const VPSingleDefRecipe *U) {
908+
auto *R = dyn_cast<VPRecipeBase>(U);
909+
return R && classof(R);
910+
}
911+
907912
void execute(VPTransformState &State) override = 0;
908913

909914
/// Compute the cost for this recipe for \p VF, using \p Opcode and \p Ctx.

llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp

Lines changed: 102 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1800,6 +1800,108 @@ void VPlanTransforms::clearReductionWrapFlags(VPlan &Plan) {
18001800
}
18011801
}
18021802

1803+
namespace {
1804+
struct VPCSEDenseMapInfo : public DenseMapInfo<VPSingleDefRecipe *> {
1805+
static bool isSentinel(const VPSingleDefRecipe *Def) {
1806+
return Def == getEmptyKey() || Def == getTombstoneKey();
1807+
}
1808+
1809+
/// Get any instruction opcode or intrinsic ID data embedded in recipe \p R.
1810+
/// Returns an optional pair, where the first element indicates whether it is
1811+
/// an intrinsic ID.
1812+
static std::optional<std::pair<bool, unsigned>>
1813+
getOpcodeOrIntrinsicID(const VPSingleDefRecipe *R) {
1814+
return TypeSwitch<const VPSingleDefRecipe *,
1815+
std::optional<std::pair<bool, unsigned>>>(R)
1816+
.Case<VPInstruction, VPWidenRecipe, VPWidenCastRecipe,
1817+
VPWidenSelectRecipe, VPReplicateRecipe>(
1818+
[](auto *I) { return std::make_pair(false, I->getOpcode()); })
1819+
.Case<VPWidenIntrinsicRecipe>([](auto *I) {
1820+
return std::make_pair(true, I->getVectorIntrinsicID());
1821+
})
1822+
.Default([](auto *) { return std::nullopt; });
1823+
}
1824+
1825+
/// During CSE, we can only handle certain recipes that don't read from
1826+
/// memory: if they read from memory, there could be an intervening write to
1827+
/// memory before the next instance is CSE'd, leading to an incorrect result.
1828+
/// We can extend the list of handled recipes in the future, provided we
1829+
/// account for the data embedded in them while checking for equality or
1830+
/// hashing.
1831+
static bool canHandle(const VPSingleDefRecipe *Def) {
1832+
auto C = getOpcodeOrIntrinsicID(Def);
1833+
1834+
// The issue with (Insert|Extract)Value is that the index of the
1835+
// insert/extract is not a proper operand in LLVM IR, and hence also not in
1836+
// VPlan.
1837+
if (!C || (!C->first && (C->second == Instruction::InsertValue ||
1838+
C->second == Instruction::ExtractValue)))
1839+
return false;
1840+
return !Def->mayReadFromMemory();
1841+
}
1842+
1843+
/// Hash the underlying data of \p Def.
1844+
static unsigned getHashValue(const VPSingleDefRecipe *Def) {
1845+
const VPlan *Plan = Def->getParent()->getPlan();
1846+
VPTypeAnalysis TypeInfo(*Plan);
1847+
hash_code Result = hash_combine(
1848+
Def->getVPDefID(), getOpcodeOrIntrinsicID(Def),
1849+
TypeInfo.inferScalarType(Def), vputils::isSingleScalar(Def),
1850+
hash_combine_range(Def->operands()));
1851+
if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(Def))
1852+
if (RFlags->hasPredicate())
1853+
return hash_combine(Result, RFlags->getPredicate());
1854+
return Result;
1855+
}
1856+
1857+
/// Check equality of underlying data of \p L and \p R.
1858+
static bool isEqual(const VPSingleDefRecipe *L, const VPSingleDefRecipe *R) {
1859+
if (isSentinel(L) || isSentinel(R))
1860+
return L == R;
1861+
if (L->getVPDefID() != R->getVPDefID() ||
1862+
getOpcodeOrIntrinsicID(L) != getOpcodeOrIntrinsicID(R) ||
1863+
vputils::isSingleScalar(L) != vputils::isSingleScalar(R) ||
1864+
!equal(L->operands(), R->operands()))
1865+
return false;
1866+
if (auto *LFlags = dyn_cast<VPRecipeWithIRFlags>(L))
1867+
if (LFlags->hasPredicate() &&
1868+
LFlags->getPredicate() !=
1869+
cast<VPRecipeWithIRFlags>(R)->getPredicate())
1870+
return false;
1871+
const VPlan *Plan = L->getParent()->getPlan();
1872+
VPTypeAnalysis TypeInfo(*Plan);
1873+
return TypeInfo.inferScalarType(L) == TypeInfo.inferScalarType(R);
1874+
}
1875+
};
1876+
} // end anonymous namespace
1877+
1878+
/// Perform a common-subexpression-elimination of VPSingleDefRecipes on the \p
1879+
/// Plan.
1880+
void VPlanTransforms::cse(VPlan &Plan) {
1881+
VPDominatorTree VPDT(Plan);
1882+
DenseMap<VPSingleDefRecipe *, VPSingleDefRecipe *, VPCSEDenseMapInfo> CSEMap;
1883+
1884+
for (VPBasicBlock *VPBB : VPBlockUtils::blocksOnly<VPBasicBlock>(
1885+
vp_depth_first_deep(Plan.getEntry()))) {
1886+
for (VPRecipeBase &R : *VPBB) {
1887+
auto *Def = dyn_cast<VPSingleDefRecipe>(&R);
1888+
if (!Def || !VPCSEDenseMapInfo::canHandle(Def))
1889+
continue;
1890+
if (VPSingleDefRecipe *V = CSEMap.lookup(Def)) {
1891+
// V must dominate Def for a valid replacement.
1892+
if (!VPDT.dominates(V->getParent(), VPBB))
1893+
continue;
1894+
// Drop poison-generating flags when reusing a value.
1895+
if (auto *RFlags = dyn_cast<VPRecipeWithIRFlags>(V))
1896+
RFlags->dropPoisonGeneratingFlags();
1897+
Def->replaceAllUsesWith(V);
1898+
continue;
1899+
}
1900+
CSEMap[Def] = Def;
1901+
}
1902+
}
1903+
}
1904+
18031905
/// Move loop-invariant recipes out of the vector loop region in \p Plan.
18041906
static void licm(VPlan &Plan) {
18051907
VPBasicBlock *Preheader = Plan.getVectorPreheader();

llvm/lib/Transforms/Vectorize/VPlanTransforms.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,6 +286,9 @@ struct VPlanTransforms {
286286
/// removing dead edges to their successors.
287287
static void removeBranchOnConst(VPlan &Plan);
288288

289+
/// Perform common-subexpression-elimination.
290+
static void cse(VPlan &Plan);
291+
289292
/// If there's a single exit block, optimize its phi recipes that use exiting
290293
/// IV values by feeding them precomputed end values instead, possibly taken
291294
/// one step backwards.

llvm/test/Transforms/LoopVectorize/AArch64/drop-poison-generating-flags.ll

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -54,15 +54,11 @@ define void @check_widen_intrinsic_with_nnan(ptr noalias %dst.0, ptr noalias %ds
5454
; CHECK-NEXT: br label %[[PRED_LOAD_CONTINUE6]]
5555
; CHECK: [[PRED_LOAD_CONTINUE6]]:
5656
; CHECK-NEXT: [[TMP24:%.*]] = phi <4 x double> [ [[TMP20]], %[[PRED_LOAD_CONTINUE4]] ], [ [[TMP23]], %[[PRED_LOAD_IF5]] ]
57-
; CHECK-NEXT: [[TMP25:%.*]] = add i64 [[INDEX]], -1
58-
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[DST_0]], i64 [[TMP25]]
59-
; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP26]], i32 8, <4 x i1> [[TMP4]])
57+
; CHECK-NEXT: call void @llvm.masked.store.v4f64.p0(<4 x double> zeroinitializer, ptr [[TMP7]], i32 8, <4 x i1> [[TMP4]])
6058
; CHECK-NEXT: [[TMP28:%.*]] = fcmp oeq <4 x double> [[TMP24]], zeroinitializer
6159
; CHECK-NEXT: [[TMP29:%.*]] = select <4 x i1> [[TMP4]], <4 x i1> [[TMP28]], <4 x i1> zeroinitializer
6260
; CHECK-NEXT: [[TMP30:%.*]] = or <4 x i1> [[TMP5]], [[TMP29]]
63-
; CHECK-NEXT: [[TMP31:%.*]] = extractelement <4 x i1> [[TMP29]], i32 0
64-
; CHECK-NEXT: [[PREDPHI:%.*]] = select i1 [[TMP31]], i64 [[TMP25]], i64 [[TMP6]]
65-
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST_1]], i64 [[PREDPHI]]
61+
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[DST_1]], i64 [[TMP6]]
6662
; CHECK-NEXT: call void @llvm.masked.store.v4i32.p0(<4 x i32> splat (i32 10), ptr [[TMP32]], i32 4, <4 x i1> [[TMP30]])
6763
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
6864
; CHECK-NEXT: [[TMP34:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1000

llvm/test/Transforms/LoopVectorize/AArch64/epilog-vectorization-widen-inductions.ll

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -330,11 +330,10 @@ define void @test_widen_induction_step_2(ptr %A, i64 %N, i32 %step) {
330330
; CHECK-NEXT: [[CMP_N11:%.*]] = icmp eq i64 [[N]], [[IND_END]]
331331
; CHECK-NEXT: br i1 [[CMP_N11]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]]
332332
; CHECK: vec.epilog.scalar.ph:
333-
; CHECK-NEXT: [[BC_RESUME_VAL5:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
334-
; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK]] ]
333+
; CHECK-NEXT: [[BC_RESUME_VAL6:%.*]] = phi i64 [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[IND_END4]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ]
335334
; CHECK-NEXT: br label [[LOOP:%.*]]
336335
; CHECK: loop:
337-
; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL5]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
336+
; CHECK-NEXT: [[IV_1:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_1_NEXT:%.*]], [[LOOP]] ]
338337
; CHECK-NEXT: [[IV_2:%.*]] = phi i64 [ [[BC_RESUME_VAL6]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_1_NEXT]], [[LOOP]] ]
339338
; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV_1]]
340339
; CHECK-NEXT: [[ADD:%.*]] = add i64 [[IV_2]], 10

llvm/test/Transforms/LoopVectorize/AArch64/force-target-instruction-cost.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -377,9 +377,8 @@ define void @invalid_legacy_cost(i64 %N, ptr %x) #0 {
377377
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
378378
; CHECK: [[VECTOR_BODY]]:
379379
; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
380-
; CHECK-NEXT: [[TMP5:%.*]] = alloca i8, i64 0, align 16
381380
; CHECK-NEXT: [[TMP6:%.*]] = alloca i8, i64 0, align 16
382-
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP5]], i32 0
381+
; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x ptr> poison, ptr [[TMP6]], i32 0
383382
; CHECK-NEXT: [[TMP8:%.*]] = insertelement <2 x ptr> [[TMP7]], ptr [[TMP6]], i32 1
384383
; CHECK-NEXT: [[TMP9:%.*]] = getelementptr ptr, ptr [[X]], i64 [[INDEX]]
385384
; CHECK-NEXT: store <2 x ptr> [[TMP8]], ptr [[TMP9]], align 8

llvm/test/Transforms/LoopVectorize/AArch64/induction-costs-sve.ll

Lines changed: 3 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -44,10 +44,8 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
4444
; DEFAULT-NEXT: [[TMP27:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
4545
; DEFAULT-NEXT: [[TMP28:%.*]] = mul <vscale x 8 x i16> [[TMP26]], [[TMP13]]
4646
; DEFAULT-NEXT: [[TMP29:%.*]] = mul <vscale x 8 x i16> [[TMP27]], [[TMP13]]
47-
; DEFAULT-NEXT: [[TMP30:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i16>
48-
; DEFAULT-NEXT: [[TMP31:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD4]] to <vscale x 8 x i16>
49-
; DEFAULT-NEXT: [[TMP32:%.*]] = or <vscale x 8 x i16> [[TMP28]], [[TMP30]]
50-
; DEFAULT-NEXT: [[TMP33:%.*]] = or <vscale x 8 x i16> [[TMP29]], [[TMP31]]
47+
; DEFAULT-NEXT: [[TMP32:%.*]] = or <vscale x 8 x i16> [[TMP28]], [[TMP26]]
48+
; DEFAULT-NEXT: [[TMP33:%.*]] = or <vscale x 8 x i16> [[TMP29]], [[TMP27]]
5149
; DEFAULT-NEXT: [[TMP34:%.*]] = lshr <vscale x 8 x i16> [[TMP32]], splat (i16 1)
5250
; DEFAULT-NEXT: [[TMP35:%.*]] = lshr <vscale x 8 x i16> [[TMP33]], splat (i16 1)
5351
; DEFAULT-NEXT: [[TMP36:%.*]] = trunc <vscale x 8 x i16> [[TMP34]] to <vscale x 8 x i8>
@@ -118,8 +116,7 @@ define void @iv_casts(ptr %dst, ptr %src, i32 %x, i64 %N) #0 {
118116
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP18]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
119117
; PRED-NEXT: [[TMP17:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
120118
; PRED-NEXT: [[TMP22:%.*]] = mul <vscale x 16 x i16> [[TMP17]], [[TMP16]]
121-
; PRED-NEXT: [[TMP24:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i16>
122-
; PRED-NEXT: [[TMP20:%.*]] = or <vscale x 16 x i16> [[TMP22]], [[TMP24]]
119+
; PRED-NEXT: [[TMP20:%.*]] = or <vscale x 16 x i16> [[TMP22]], [[TMP17]]
123120
; PRED-NEXT: [[TMP21:%.*]] = lshr <vscale x 16 x i16> [[TMP20]], splat (i16 1)
124121
; PRED-NEXT: [[TMP23:%.*]] = trunc <vscale x 16 x i16> [[TMP21]] to <vscale x 16 x i8>
125122
; PRED-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[INDEX]]

llvm/test/Transforms/LoopVectorize/AArch64/interleave-with-gaps.ll

Lines changed: 4 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
1919
; CHECK: [[VECTOR_PH]]:
2020
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0
2121
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
22-
; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
2322
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
2423
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
2524
; CHECK: [[VECTOR_BODY]]:
@@ -29,7 +28,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
2928
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1
3029
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <64 x i8> [[WIDE_VEC]], <64 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
3130
; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
32-
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP2]], <16 x i32> [[TMP5]])
31+
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP5]])
3332
; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP6]])
3433
; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
3534
; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP8]], align 1
@@ -58,7 +57,6 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
5857
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[X]], i64 0
5958
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
6059
; CHECK-NEXT: [[TMP22:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
61-
; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
6260
; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
6361
; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
6462
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
@@ -75,7 +73,7 @@ define i64 @vector_loop_with_remaining_iterations(ptr %src, ptr noalias %dst, i3
7573
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3
7674
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP28]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison)
7775
; CHECK-NEXT: [[TMP29:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32>
78-
; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]])
76+
; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP29]])
7977
; CHECK-NEXT: [[TMP31:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP30]])
8078
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX7]]
8179
; CHECK-NEXT: store <vscale x 2 x i8> zeroinitializer, ptr [[TMP32]], align 1
@@ -154,7 +152,6 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
154152
; CHECK: [[VECTOR_PH]]:
155153
; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[X]], i64 0
156154
; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
157-
; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
158155
; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i32> @llvm.abs.v16i32(<16 x i32> [[BROADCAST_SPLAT]], i1 false)
159156
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
160157
; CHECK: [[VECTOR_BODY]]:
@@ -164,7 +161,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
164161
; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <64 x i8>, ptr [[TMP4]], align 1
165162
; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <64 x i8> [[WIDE_VEC]], <64 x i8> poison, <16 x i32> <i32 0, i32 4, i32 8, i32 12, i32 16, i32 20, i32 24, i32 28, i32 32, i32 36, i32 40, i32 44, i32 48, i32 52, i32 56, i32 60>
166163
; CHECK-NEXT: [[TMP5:%.*]] = zext <16 x i8> [[STRIDED_VEC]] to <16 x i32>
167-
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP2]], <16 x i32> [[TMP5]])
164+
; CHECK-NEXT: [[TMP6:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP5]])
168165
; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i32> @llvm.umin.v16i32(<16 x i32> [[TMP3]], <16 x i32> [[TMP6]])
169166
; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
170167
; CHECK-NEXT: store <16 x i8> zeroinitializer, ptr [[TMP26]], align 1
@@ -193,7 +190,6 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
193190
; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 2 x i32> poison, i32 [[X]], i64 0
194191
; CHECK-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <vscale x 2 x i32> [[BROADCAST_SPLATINSERT1]], <vscale x 2 x i32> poison, <vscale x 2 x i32> zeroinitializer
195192
; CHECK-NEXT: [[TMP22:%.*]] = insertelement <vscale x 2 x i64> zeroinitializer, i64 [[BC_MERGE_RDX]], i32 0
196-
; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
197193
; CHECK-NEXT: [[TMP24:%.*]] = call <vscale x 2 x i32> @llvm.abs.nxv2i32(<vscale x 2 x i32> [[BROADCAST_SPLAT2]], i1 false)
198194
; CHECK-NEXT: [[TMP25:%.*]] = call <vscale x 2 x i64> @llvm.stepvector.nxv2i64()
199195
; CHECK-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <vscale x 2 x i64> poison, i64 [[BC_RESUME_VAL]], i64 0
@@ -210,7 +206,7 @@ define i64 @main_vector_loop_fixed_with_no_remaining_iterations(ptr %src, ptr no
210206
; CHECK-NEXT: [[TMP28:%.*]] = getelementptr { [4 x i8] }, ptr [[SRC]], <vscale x 2 x i64> [[VEC_IND]], i32 0, i64 3
211207
; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 2 x i8> @llvm.masked.gather.nxv2i8.nxv2p0(<vscale x 2 x ptr> [[TMP28]], i32 1, <vscale x 2 x i1> splat (i1 true), <vscale x 2 x i8> poison)
212208
; CHECK-NEXT: [[TMP29:%.*]] = zext <vscale x 2 x i8> [[WIDE_MASKED_GATHER]] to <vscale x 2 x i32>
213-
; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP23]], <vscale x 2 x i32> [[TMP29]])
209+
; CHECK-NEXT: [[TMP30:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP29]])
214210
; CHECK-NEXT: [[TMP31:%.*]] = call <vscale x 2 x i32> @llvm.umin.nxv2i32(<vscale x 2 x i32> [[TMP24]], <vscale x 2 x i32> [[TMP30]])
215211
; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX7]]
216212
; CHECK-NEXT: store <vscale x 2 x i8> zeroinitializer, ptr [[TMP32]], align 1

llvm/test/Transforms/LoopVectorize/AArch64/licm-calls.ll

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,8 @@ define void @licm_replicate_call(double %x, ptr %dst) {
99
; CHECK-NEXT: [[ENTRY:.*]]:
1010
; CHECK-NEXT: br i1 false, label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
1111
; CHECK: [[VECTOR_PH]]:
12-
; CHECK-NEXT: [[TMP0:%.*]] = tail call double @llvm.pow.f64(double [[X]], double 3.000000e+00)
1312
; CHECK-NEXT: [[TMP1:%.*]] = tail call double @llvm.pow.f64(double [[X]], double 3.000000e+00)
14-
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP0]], i32 0
13+
; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x double> poison, double [[TMP1]], i32 0
1514
; CHECK-NEXT: [[TMP3:%.*]] = insertelement <2 x double> [[TMP2]], double [[TMP1]], i32 1
1615
; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
1716
; CHECK: [[VECTOR_BODY]]:

0 commit comments

Comments
 (0)