Skip to content

Commit b8f5de4

Browse files
committed
[LV][VPlan] Implement VPlan-based cost for exit condition.
This patch tries to model the cost of exit conditions through the VPlan-based cost model. * `BranchOnCount` will generate icmp + br. The branch instruction is already accounted for by the VPRegionBlock, so we only need to calculate the cost of the icmp. If the VF is the same as the trip count of the loop, the cost of the BranchOnCount is free. This patch is not quite NFC for the following reasons. * Some BranchOnCount instructions can be optimized to BranchOnCond, which is free. * Some of the instructions counted in the exit condition by the legacy cost model will be optimized out.
1 parent aaf5493 commit b8f5de4

File tree

13 files changed

+287
-278
lines changed

13 files changed

+287
-278
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 16 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -6224,6 +6224,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
62246224
case Instruction::ICmp:
62256225
case Instruction::FCmp: {
62266226
Type *ValTy = I->getOperand(0)->getType();
6227+
InstructionCost Cost = 0;
62276228

62286229
if (canTruncateToMinimalBitwidth(I, VF)) {
62296230
[[maybe_unused]] Instruction *Op0AsInstruction =
@@ -6235,11 +6236,22 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
62356236
ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
62366237
}
62376238

6239+
// If the Cmp instruction has multiple uses in the loop, it
6240+
// will generate a scalar Cmp for latch and a vector Cmp for other uses.
6241+
if (I == TheLoop->getLatchCmpInst() && !I->hasOneUse())
6242+
Cost += TTI.getCmpSelInstrCost(I->getOpcode(), ValTy,
6243+
CmpInst::makeCmpResultType(ValTy),
6244+
cast<CmpInst>(I)->getPredicate(), CostKind,
6245+
{TTI::OK_AnyValue, TTI::OP_None},
6246+
{TTI::OK_AnyValue, TTI::OP_None}, I);
6247+
62386248
VectorTy = toVectorTy(ValTy, VF);
6239-
return TTI.getCmpSelInstrCost(
6240-
I->getOpcode(), VectorTy, CmpInst::makeCmpResultType(VectorTy),
6241-
cast<CmpInst>(I)->getPredicate(), CostKind,
6242-
{TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, I);
6249+
return Cost + TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy,
6250+
CmpInst::makeCmpResultType(VectorTy),
6251+
cast<CmpInst>(I)->getPredicate(),
6252+
CostKind,
6253+
{TTI::OK_AnyValue, TTI::OP_None},
6254+
{TTI::OK_AnyValue, TTI::OP_None}, I);
62436255
}
62446256
case Instruction::Store:
62456257
case Instruction::Load: {
@@ -6769,46 +6781,6 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
67696781
}
67706782
}
67716783

6772-
/// Compute the cost of all exiting conditions of the loop using the legacy
6773-
/// cost model. This is to match the legacy behavior, which adds the cost of
6774-
/// all exit conditions. Note that this over-estimates the cost, as there will
6775-
/// be a single condition to control the vector loop.
6776-
SmallVector<BasicBlock *> Exiting;
6777-
CM.TheLoop->getExitingBlocks(Exiting);
6778-
SetVector<Instruction *> ExitInstrs;
6779-
// Collect all exit conditions.
6780-
for (BasicBlock *EB : Exiting) {
6781-
auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
6782-
if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
6783-
continue;
6784-
if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
6785-
ExitInstrs.insert(CondI);
6786-
}
6787-
}
6788-
// Compute the cost of all instructions only feeding the exit conditions.
6789-
for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
6790-
Instruction *CondI = ExitInstrs[I];
6791-
if (!OrigLoop->contains(CondI) ||
6792-
!CostCtx.SkipCostComputation.insert(CondI).second)
6793-
continue;
6794-
InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
6795-
LLVM_DEBUG({
6796-
dbgs() << "Cost of " << CondICost << " for VF " << VF
6797-
<< ": exit condition instruction " << *CondI << "\n";
6798-
});
6799-
Cost += CondICost;
6800-
for (Value *Op : CondI->operands()) {
6801-
auto *OpI = dyn_cast<Instruction>(Op);
6802-
if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
6803-
any_of(OpI->users(), [&ExitInstrs, this](User *U) {
6804-
return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
6805-
!ExitInstrs.contains(cast<Instruction>(U));
6806-
}))
6807-
continue;
6808-
ExitInstrs.insert(OpI);
6809-
}
6810-
}
6811-
68126784
// Pre-compute the costs for branches except for the backedge, as the number
68136785
// of replicate regions in a VPlan may not directly match the number of
68146786
// branches, which would lead to different decisions.

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1104,6 +1104,36 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
11041104
return Ctx.TTI.getArithmeticReductionCost(
11051105
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
11061106
}
1107+
case VPInstruction::BranchOnCount: {
1108+
Type *ValTy = Ctx.Types.inferScalarType(getOperand(0));
1109+
1110+
// If the vector loop only executed once (VF == original trip count), ignore
1111+
// the cost of cmp.
1112+
// TODO: We can remove this after hoist `unrollByUF` and
1113+
// `optimizeForVFandUF` which will optimize BranchOnCount out.
1114+
auto TC = dyn_cast_if_present<ConstantInt>(
1115+
getParent()->getPlan()->getTripCount()->getUnderlyingValue());
1116+
if (TC && VF.isFixed() && TC->getZExtValue() == VF.getFixedValue())
1117+
return 0;
1118+
1119+
// BranchOnCount will generate icmp_eq + br instructions and the cost of
1120+
// branch will be calculated in VPRegionBlock.
1121+
return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ValTy, nullptr,
1122+
CmpInst::ICMP_EQ, Ctx.CostKind);
1123+
}
1124+
case VPInstruction::BranchOnCond: {
1125+
// BranchOnCond is free since the branch cost is already calculated by VPBB.
1126+
if (vputils::onlyFirstLaneUsed(getOperand(0)))
1127+
return 0;
1128+
1129+
// Otherwise, BranchOnCond will generate `extractelement` to extract the
1130+
// condition from vector type.
1131+
return Ctx.TTI.getVectorInstrCost(
1132+
Instruction::ExtractElement,
1133+
cast<VectorType>(
1134+
toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF)),
1135+
Ctx.CostKind, 0, nullptr, nullptr);
1136+
}
11071137
case VPInstruction::FirstActiveLane: {
11081138
Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
11091139
if (VF.isScalar())

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 66 additions & 43 deletions
Original file line numberDiff line numberDiff line change
@@ -534,25 +534,47 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
534534
; DEFAULT-LABEL: define void @multiple_exit_conditions(
535535
; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] {
536536
; DEFAULT-NEXT: [[ENTRY:.*:]]
537-
; DEFAULT-NEXT: br label %[[VECTOR_PH:.*]]
537+
; DEFAULT-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
538+
; DEFAULT-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP5]], 3
539+
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 257, [[TMP4]]
540+
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
538541
; DEFAULT: [[VECTOR_PH]]:
539-
; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 2048
542+
; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
543+
; DEFAULT-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
544+
; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 257, [[TMP3]]
545+
; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 257, [[N_MOD_VF]]
546+
; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 8
547+
; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
548+
; DEFAULT-NEXT: [[TMP8:%.*]] = mul i64 [[N_VEC]], 2
540549
; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
541550
; DEFAULT: [[VECTOR_BODY]]:
542551
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
543552
; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
544553
; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]]
545554
; DEFAULT-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2
546-
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0
547-
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
548-
; DEFAULT-NEXT: [[TMP2:%.*]] = or <8 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
549-
; DEFAULT-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x double>
550-
; DEFAULT-NEXT: store <8 x double> [[TMP3]], ptr [[NEXT_GEP]], align 8
551-
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
552-
; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
553-
; DEFAULT-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
555+
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[TMP1]], i64 0
556+
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
557+
; DEFAULT-NEXT: [[TMP20:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
558+
; DEFAULT-NEXT: [[TMP9:%.*]] = uitofp <vscale x 2 x i16> [[TMP20]] to <vscale x 2 x double>
559+
; DEFAULT-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
560+
; DEFAULT-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1
561+
; DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP11]]
562+
; DEFAULT-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
563+
; DEFAULT-NEXT: [[TMP14:%.*]] = shl nuw i64 [[TMP13]], 2
564+
; DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP14]]
565+
; DEFAULT-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
566+
; DEFAULT-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 6
567+
; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP17]]
568+
; DEFAULT-NEXT: store <vscale x 2 x double> [[TMP9]], ptr [[NEXT_GEP]], align 8
569+
; DEFAULT-NEXT: store <vscale x 2 x double> [[TMP9]], ptr [[TMP12]], align 8
570+
; DEFAULT-NEXT: store <vscale x 2 x double> [[TMP9]], ptr [[TMP15]], align 8
571+
; DEFAULT-NEXT: store <vscale x 2 x double> [[TMP9]], ptr [[TMP18]], align 8
572+
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
573+
; DEFAULT-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
574+
; DEFAULT-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
554575
; DEFAULT: [[MIDDLE_BLOCK]]:
555-
; DEFAULT-NEXT: br label %[[SCALAR_PH:.*]]
576+
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 257, [[N_VEC]]
577+
; DEFAULT-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
556578
; DEFAULT: [[SCALAR_PH]]:
557579
;
558580
; PRED-LABEL: define void @multiple_exit_conditions(
@@ -660,16 +682,17 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
660682
; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1
661683
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]]
662684
; COMMON: [[PRED_STORE_CONTINUE12]]:
663-
; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT1:.*]]
685+
; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]]
664686
; COMMON: [[PRED_STORE_IF13]]:
665687
; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
666688
; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1
667-
; COMMON-NEXT: br label %[[EXIT1]]
689+
; COMMON-NEXT: br label %[[EXIT]]
690+
; COMMON: [[EXIT]]:
691+
; COMMON-NEXT: br label %[[SCALAR_PH:.*]]
692+
; COMMON: [[SCALAR_PH]]:
693+
; COMMON-NEXT: br label %[[EXIT1:.*]]
668694
; COMMON: [[EXIT1]]:
669-
; COMMON-NEXT: br label %[[SCALAR_PH1:.*]]
670-
; COMMON: [[SCALAR_PH1]]:
671-
; COMMON-NEXT: br [[EXIT:label %.*]]
672-
; COMMON: [[SCALAR_PH:.*:]]
695+
; COMMON-NEXT: ret void
673696
;
674697
entry:
675698
br label %loop
@@ -1302,7 +1325,7 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 {
13021325
; PRED-NEXT: br label %[[VECTOR_MEMCHECK:.*]]
13031326
; PRED: [[VECTOR_MEMCHECK]]:
13041327
; PRED-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64()
1305-
; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 16
1328+
; PRED-NEXT: [[TMP2:%.*]] = mul nuw i64 [[TMP1]], 4
13061329
; PRED-NEXT: [[TMP3:%.*]] = sub i64 [[C1]], [[A2]]
13071330
; PRED-NEXT: [[DIFF_CHECK:%.*]] = icmp ult i64 [[TMP3]], [[TMP2]]
13081331
; PRED-NEXT: [[TMP4:%.*]] = sub i64 [[C1]], [[B3]]
@@ -1311,42 +1334,42 @@ define void @pred_udiv_select_cost(ptr %A, ptr %B, ptr %C, i64 %n, i8 %y) #1 {
13111334
; PRED-NEXT: br i1 [[CONFLICT_RDX]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
13121335
; PRED: [[VECTOR_PH]]:
13131336
; PRED-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
1314-
; PRED-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 16
1337+
; PRED-NEXT: [[TMP6:%.*]] = mul nuw i64 [[TMP5]], 4
13151338
; PRED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
1316-
; PRED-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 4
1339+
; PRED-NEXT: [[TMP8:%.*]] = shl nuw i64 [[TMP7]], 2
13171340
; PRED-NEXT: [[TMP9:%.*]] = sub i64 [[TMP0]], [[TMP8]]
13181341
; PRED-NEXT: [[TMP10:%.*]] = icmp ugt i64 [[TMP0]], [[TMP8]]
13191342
; PRED-NEXT: [[TMP11:%.*]] = select i1 [[TMP10]], i64 [[TMP9]], i64 0
1320-
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 [[TMP0]])
1321-
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i8> poison, i8 [[Y]], i64 0
1322-
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer
1343+
; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 0, i64 [[TMP0]])
1344+
; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i8> poison, i8 [[Y]], i64 0
1345+
; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i8> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i8> poison, <vscale x 4 x i32> zeroinitializer
13231346
; PRED-NEXT: br label %[[VECTOR_BODY:.*]]
13241347
; PRED: [[VECTOR_BODY]]:
13251348
; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
1326-
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
1349+
; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 4 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], %[[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], %[[VECTOR_BODY]] ]
13271350
; PRED-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]]
1328-
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP12]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
1329-
; PRED-NEXT: [[TMP13:%.*]] = uitofp <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x float>
1351+
; PRED-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP12]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
1352+
; PRED-NEXT: [[TMP13:%.*]] = uitofp <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x float>
13301353
; PRED-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]]
1331-
; PRED-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP14]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
1332-
; PRED-NEXT: [[TMP15:%.*]] = icmp ne <vscale x 16 x i8> [[WIDE_MASKED_LOAD5]], zeroinitializer
1333-
; PRED-NEXT: [[TMP16:%.*]] = select <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i1> zeroinitializer
1334-
; PRED-NEXT: [[TMP17:%.*]] = xor <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
1335-
; PRED-NEXT: [[TMP18:%.*]] = select <vscale x 16 x i1> [[TMP16]], <vscale x 16 x i8> [[BROADCAST_SPLAT]], <vscale x 16 x i8> splat (i8 1)
1336-
; PRED-NEXT: [[TMP19:%.*]] = udiv <vscale x 16 x i8> [[TMP17]], [[TMP18]]
1337-
; PRED-NEXT: [[TMP20:%.*]] = icmp ugt <vscale x 16 x i8> [[TMP19]], splat (i8 1)
1338-
; PRED-NEXT: [[TMP21:%.*]] = select <vscale x 16 x i1> [[TMP20]], <vscale x 16 x i32> zeroinitializer, <vscale x 16 x i32> splat (i32 255)
1339-
; PRED-NEXT: [[PREDPHI:%.*]] = select <vscale x 16 x i1> [[TMP15]], <vscale x 16 x i32> [[TMP21]], <vscale x 16 x i32> zeroinitializer
1340-
; PRED-NEXT: [[TMP22:%.*]] = zext <vscale x 16 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 16 x i32>
1341-
; PRED-NEXT: [[TMP23:%.*]] = sub <vscale x 16 x i32> [[PREDPHI]], [[TMP22]]
1342-
; PRED-NEXT: [[TMP24:%.*]] = sitofp <vscale x 16 x i32> [[TMP23]] to <vscale x 16 x float>
1343-
; PRED-NEXT: [[TMP25:%.*]] = call <vscale x 16 x float> @llvm.fmuladd.nxv16f32(<vscale x 16 x float> [[TMP24]], <vscale x 16 x float> splat (float 3.000000e+00), <vscale x 16 x float> [[TMP13]])
1344-
; PRED-NEXT: [[TMP26:%.*]] = fptoui <vscale x 16 x float> [[TMP25]] to <vscale x 16 x i8>
1354+
; PRED-NEXT: [[WIDE_MASKED_LOAD5:%.*]] = call <vscale x 4 x i8> @llvm.masked.load.nxv4i8.p0(ptr [[TMP14]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i8> poison)
1355+
; PRED-NEXT: [[TMP15:%.*]] = icmp ne <vscale x 4 x i8> [[WIDE_MASKED_LOAD5]], zeroinitializer
1356+
; PRED-NEXT: [[TMP16:%.*]] = select <vscale x 4 x i1> [[ACTIVE_LANE_MASK]], <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i1> zeroinitializer
1357+
; PRED-NEXT: [[TMP17:%.*]] = xor <vscale x 4 x i8> [[WIDE_MASKED_LOAD]], splat (i8 1)
1358+
; PRED-NEXT: [[TMP18:%.*]] = select <vscale x 4 x i1> [[TMP16]], <vscale x 4 x i8> [[BROADCAST_SPLAT]], <vscale x 4 x i8> splat (i8 1)
1359+
; PRED-NEXT: [[TMP19:%.*]] = udiv <vscale x 4 x i8> [[TMP17]], [[TMP18]]
1360+
; PRED-NEXT: [[TMP20:%.*]] = icmp ugt <vscale x 4 x i8> [[TMP19]], splat (i8 1)
1361+
; PRED-NEXT: [[TMP21:%.*]] = select <vscale x 4 x i1> [[TMP20]], <vscale x 4 x i32> zeroinitializer, <vscale x 4 x i32> splat (i32 255)
1362+
; PRED-NEXT: [[PREDPHI:%.*]] = select <vscale x 4 x i1> [[TMP15]], <vscale x 4 x i32> [[TMP21]], <vscale x 4 x i32> zeroinitializer
1363+
; PRED-NEXT: [[TMP22:%.*]] = zext <vscale x 4 x i8> [[WIDE_MASKED_LOAD]] to <vscale x 4 x i32>
1364+
; PRED-NEXT: [[TMP23:%.*]] = sub <vscale x 4 x i32> [[PREDPHI]], [[TMP22]]
1365+
; PRED-NEXT: [[TMP24:%.*]] = sitofp <vscale x 4 x i32> [[TMP23]] to <vscale x 4 x float>
1366+
; PRED-NEXT: [[TMP25:%.*]] = call <vscale x 4 x float> @llvm.fmuladd.nxv4f32(<vscale x 4 x float> [[TMP24]], <vscale x 4 x float> splat (float 3.000000e+00), <vscale x 4 x float> [[TMP13]])
1367+
; PRED-NEXT: [[TMP26:%.*]] = fptoui <vscale x 4 x float> [[TMP25]] to <vscale x 4 x i8>
13451368
; PRED-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[C]], i64 [[INDEX]]
1346-
; PRED-NEXT: call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP26]], ptr [[TMP27]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
1369+
; PRED-NEXT: call void @llvm.masked.store.nxv4i8.p0(<vscale x 4 x i8> [[TMP26]], ptr [[TMP27]], i32 1, <vscale x 4 x i1> [[ACTIVE_LANE_MASK]])
13471370
; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]]
1348-
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX]], i64 [[TMP11]])
1349-
; PRED-NEXT: [[TMP28:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
1371+
; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <vscale x 4 x i1> @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP11]])
1372+
; PRED-NEXT: [[TMP28:%.*]] = extractelement <vscale x 4 x i1> [[ACTIVE_LANE_MASK_NEXT]], i32 0
13501373
; PRED-NEXT: [[TMP29:%.*]] = xor i1 [[TMP28]], true
13511374
; PRED-NEXT: br i1 [[TMP29]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
13521375
; PRED: [[MIDDLE_BLOCK]]:

llvm/test/Transforms/LoopVectorize/AArch64/early_exit_costs.ll

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -92,11 +92,11 @@ loop.end:
9292

9393
define i64 @vectorization_not_profitable_due_to_trunc(ptr dereferenceable(800) %src) {
9494
; CHECK-LABEL: LV: Checking a loop in 'vectorization_not_profitable_due_to_trunc'
95-
; CHECK: LV: Selecting VF: 1.
96-
; CHECK-NEXT: Calculating cost of work in exit block vector.early.exit:
97-
; CHECK-NEXT: Cost of 1 for VF 1: EMIT vp<%first.active.lane> = first-active-lane ir<%t>
98-
; CHECK-NEXT: Cost of 0 for VF 1: EMIT vp<%early.exit.value> = extract-lane vp<%first.active.lane>, ir<%l>
99-
; CHECK-NEXT: LV: Vectorization is possible but not beneficial.
95+
; CHECK: Calculating cost of work in exit block vector.early.exit:
96+
; CHECK-NEXT: Cost of 6 for VF 2: EMIT vp<{{.*}}> = first-active-lane ir<{{.*}}>
97+
; CHECK-NEXT: Cost of 2 for VF 2: EMIT vp<{{.*}}> = extract-lane vp<{{.*}}>, ir<{{.*}}>
98+
; CHECK: LV: Minimum required TC for runtime checks to be profitable:28
99+
; CHECK: LV: Found a vectorizable loop (2)
100100
entry:
101101
br label %loop.header
102102

0 commit comments

Comments
 (0)