60 changes: 16 additions & 44 deletions llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -6224,6 +6224,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
case Instruction::ICmp:
case Instruction::FCmp: {
Type *ValTy = I->getOperand(0)->getType();
InstructionCost Cost = 0;

if (canTruncateToMinimalBitwidth(I, VF)) {
[[maybe_unused]] Instruction *Op0AsInstruction =
@@ -6235,11 +6236,22 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I,
ValTy = IntegerType::get(ValTy->getContext(), MinBWs[I]);
}

// If the Cmp instruction has multiple uses in the loop, it will generate a
// scalar Cmp for the latch and a vector Cmp for the other uses.
if (I == TheLoop->getLatchCmpInst() && !I->hasOneUse())
Cost += TTI.getCmpSelInstrCost(I->getOpcode(), ValTy,
CmpInst::makeCmpResultType(ValTy),
cast<CmpInst>(I)->getPredicate(), CostKind,
{TTI::OK_AnyValue, TTI::OP_None},
{TTI::OK_AnyValue, TTI::OP_None}, I);

VectorTy = toVectorTy(ValTy, VF);
return TTI.getCmpSelInstrCost(
I->getOpcode(), VectorTy, CmpInst::makeCmpResultType(VectorTy),
cast<CmpInst>(I)->getPredicate(), CostKind,
{TTI::OK_AnyValue, TTI::OP_None}, {TTI::OK_AnyValue, TTI::OP_None}, I);
return Cost + TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy,
CmpInst::makeCmpResultType(VectorTy),
cast<CmpInst>(I)->getPredicate(),
CostKind,
{TTI::OK_AnyValue, TTI::OP_None},
{TTI::OK_AnyValue, TTI::OP_None}, I);
}
case Instruction::Store:
case Instruction::Load: {
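Note on the hunk above: when the latch compare has uses besides the backedge branch, the vectorizer keeps a scalar compare to control the latch and emits a widened compare for the remaining users, so the cost model now sums both. A minimal sketch of the loop shape this affects (hypothetical names, mirroring the `test_extra_cmp_user` check updates further down):

```llvm
; %cmp feeds both the backedge branch and a stored value, so after
; vectorization a scalar icmp (latch) and a widened icmp (other users)
; are both generated and both costed.
define i1 @latch_cmp_extra_use(ptr %dst) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %iv.next = add nuw nsw i64 %iv, 1
  %cmp = icmp eq i64 %iv.next, 16
  %ext = zext i1 %cmp to i8
  %gep = getelementptr inbounds i8, ptr %dst, i64 %iv
  store i8 %ext, ptr %gep, align 1
  br i1 %cmp, label %exit, label %loop

exit:
  ret i1 %cmp
}
```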
@@ -6769,46 +6781,6 @@ LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
}
}

/// Compute the cost of all exiting conditions of the loop using the legacy
/// cost model. This is to match the legacy behavior, which adds the cost of
/// all exit conditions. Note that this over-estimates the cost, as there will
/// be a single condition to control the vector loop.
SmallVector<BasicBlock *> Exiting;
CM.TheLoop->getExitingBlocks(Exiting);
SetVector<Instruction *> ExitInstrs;
// Collect all exit conditions.
for (BasicBlock *EB : Exiting) {
auto *Term = dyn_cast<BranchInst>(EB->getTerminator());
if (!Term || CostCtx.skipCostComputation(Term, VF.isVector()))
continue;
if (auto *CondI = dyn_cast<Instruction>(Term->getOperand(0))) {
ExitInstrs.insert(CondI);
}
}
// Compute the cost of all instructions only feeding the exit conditions.
for (unsigned I = 0; I != ExitInstrs.size(); ++I) {
Instruction *CondI = ExitInstrs[I];
if (!OrigLoop->contains(CondI) ||
!CostCtx.SkipCostComputation.insert(CondI).second)
continue;
InstructionCost CondICost = CostCtx.getLegacyCost(CondI, VF);
LLVM_DEBUG({
dbgs() << "Cost of " << CondICost << " for VF " << VF
<< ": exit condition instruction " << *CondI << "\n";
});
Cost += CondICost;
for (Value *Op : CondI->operands()) {
auto *OpI = dyn_cast<Instruction>(Op);
if (!OpI || CostCtx.skipCostComputation(OpI, VF.isVector()) ||
any_of(OpI->users(), [&ExitInstrs, this](User *U) {
return OrigLoop->contains(cast<Instruction>(U)->getParent()) &&
!ExitInstrs.contains(cast<Instruction>(U));
}))
continue;
ExitInstrs.insert(OpI);
}
}

// Pre-compute the costs for branches except for the backedge, as the number
// of replicate regions in a VPlan may not directly match the number of
// branches, which would lead to different decisions.
51 changes: 51 additions & 0 deletions llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -1104,6 +1104,36 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
return Ctx.TTI.getArithmeticReductionCost(
Instruction::Or, cast<VectorType>(VecTy), std::nullopt, Ctx.CostKind);
}
case VPInstruction::BranchOnCount: {
Type *ValTy = Ctx.Types.inferScalarType(getOperand(0));

// If the vector loop executes exactly once (VF == original trip count),
// ignore the cost of the cmp.
// TODO: We can remove this after hoisting `unrollByUF` and
// `optimizeForVFandUF`, which will optimize BranchOnCount out.
auto TC = dyn_cast_if_present<ConstantInt>(
getParent()->getPlan()->getTripCount()->getUnderlyingValue());
if (TC && VF.isFixed() && TC->getZExtValue() == VF.getFixedValue())
return 0;

// BranchOnCount will generate icmp_eq + br instructions; the cost of the
// branch is calculated in VPRegionBlock.
return Ctx.TTI.getCmpSelInstrCost(Instruction::ICmp, ValTy, nullptr,
CmpInst::ICMP_EQ, Ctx.CostKind);
}
case VPInstruction::BranchOnCond: {
// BranchOnCond is free since the branch cost is already calculated by VPBB.
if (vputils::onlyFirstLaneUsed(getOperand(0)))
return 0;

// Otherwise, BranchOnCond will generate an `extractelement` to extract the
// condition from the vector.
return Ctx.TTI.getVectorInstrCost(
Instruction::ExtractElement,
cast<VectorType>(
toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF)),
Ctx.CostKind, 0, nullptr, nullptr);
}
case VPInstruction::FirstActiveLane: {
Type *ScalarTy = Ctx.Types.inferScalarType(getOperand(0));
if (VF.isScalar())
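For context, a sketch of the IR shapes the two new branch cases are pricing (assumed names, assuming VF = 4): BranchOnCount becomes a scalar `icmp eq` on the next induction value (only the compare is costed in the case above; the branch is costed by the enclosing VPRegionBlock), and a BranchOnCond whose condition was widened first needs an `extractelement` of lane 0.

```llvm
define void @branch_shapes(i64 %n.vec, <4 x i1> %widened.cond) {
entry:
  br label %vector.body

vector.body:
  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
  %index.next = add nuw i64 %index, 4
  ; BranchOnCond with a widened condition: lane 0 must be extracted first,
  ; which is the extractelement costed in the BranchOnCond case.
  %cond0 = extractelement <4 x i1> %widened.cond, i64 0
  ; BranchOnCount: only this icmp is costed in the case above; the br itself
  ; is costed as part of the enclosing VPRegionBlock.
  %done = icmp eq i64 %index.next, %n.vec
  %exit = or i1 %done, %cond0
  br i1 %exit, label %middle.block, label %vector.body

middle.block:
  ret void
}
```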
@@ -1145,6 +1175,27 @@ InstructionCost VPInstruction::computeCost(ElementCount VF,
I32Ty, {Arg0Ty, I32Ty, I1Ty});
return Ctx.TTI.getIntrinsicInstrCost(Attrs, Ctx.CostKind);
}
case VPInstruction::Not: {
Type *RetTy = Ctx.Types.inferScalarType(getOperand(0));
if (!vputils::onlyFirstLaneUsed(this))
RetTy = toVectorTy(RetTy, VF);
return Ctx.TTI.getArithmeticInstrCost(Instruction::Xor, RetTy,
Ctx.CostKind);
}
case Instruction::ICmp:
case Instruction::FCmp: {
Instruction *CtxI = dyn_cast_or_null<Instruction>(getUnderlyingValue());
Type *SrcTy = Ctx.Types.inferScalarType(getOperand(0));
Type *RetTy = Ctx.Types.inferScalarType(this);
if (!vputils::onlyFirstLaneUsed(this)) {
SrcTy = toVectorTy(SrcTy, VF);
RetTy = toVectorTy(RetTy, VF);
}
return Ctx.TTI.getCmpSelInstrCost(Opcode, SrcTy, RetTy, getPredicate(),
Ctx.CostKind,
{TTI::OK_AnyValue, TTI::OP_None},
{TTI::OK_AnyValue, TTI::OP_None}, CtxI);
}
case VPInstruction::ExtractLastElement: {
// Add on the cost of extracting the element.
auto *VecTy = toVectorTy(Ctx.Types.inferScalarType(getOperand(0)), VF);
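The `Not` and compare cases added above both key off `onlyFirstLaneUsed`: when every lane is demanded, the operation is priced at the widened type, otherwise at the scalar type, and `Not` is priced as a `xor`. A sketch of the two resulting shapes (hypothetical functions, assuming VF = 4):

```llvm
; All lanes used: the compare and the not are costed at <4 x i32>/<4 x i1>.
define <4 x i1> @all_lanes(<4 x i32> %a, <4 x i32> %b) {
  %cmp = icmp slt <4 x i32> %a, %b
  %not = xor <4 x i1> %cmp, splat (i1 true)   ; VPInstruction::Not == xor
  ret <4 x i1> %not
}

; Only lane 0 used: both stay scalar and are costed at i32/i1.
define i1 @first_lane_only(i32 %a, i32 %b) {
  %cmp = icmp slt i32 %a, %b
  %not = xor i1 %cmp, true
  ret i1 %not
}
```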
@@ -534,25 +534,47 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 {
; DEFAULT-LABEL: define void @multiple_exit_conditions(
; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] {
; DEFAULT-NEXT: [[ENTRY:.*:]]
; DEFAULT-NEXT: br label %[[VECTOR_PH:.*]]
; DEFAULT-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP5]], 3
; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 257, [[TMP4]]
; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
; DEFAULT: [[VECTOR_PH]]:
; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 2048
; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP3:%.*]] = mul nuw i64 [[TMP2]], 8
; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 257, [[TMP3]]
; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 257, [[N_MOD_VF]]
; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[N_VEC]], 8
; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
; DEFAULT-NEXT: [[TMP8:%.*]] = mul i64 [[N_VEC]], 2
; DEFAULT-NEXT: br label %[[VECTOR_BODY:.*]]
; DEFAULT: [[VECTOR_BODY]]:
; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
; DEFAULT-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8
; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[OFFSET_IDX]]
; DEFAULT-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
; DEFAULT-NEXT: [[TMP2:%.*]] = or <8 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
; DEFAULT-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x double>
; DEFAULT-NEXT: store <8 x double> [[TMP3]], ptr [[NEXT_GEP]], align 8
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256
; DEFAULT-NEXT: br i1 [[TMP5]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 2 x i16> poison, i16 [[TMP1]], i64 0
; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 2 x i16> [[BROADCAST_SPLATINSERT]], <vscale x 2 x i16> poison, <vscale x 2 x i32> zeroinitializer
; DEFAULT-NEXT: [[TMP20:%.*]] = or <vscale x 2 x i16> [[BROADCAST_SPLAT]], splat (i16 1)
; DEFAULT-NEXT: [[TMP9:%.*]] = uitofp <vscale x 2 x i16> [[TMP20]] to <vscale x 2 x double>
; DEFAULT-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP11:%.*]] = shl nuw i64 [[TMP10]], 1
; DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP11]]
; DEFAULT-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP14:%.*]] = shl nuw i64 [[TMP13]], 2
; DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP14]]
; DEFAULT-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
; DEFAULT-NEXT: [[TMP17:%.*]] = mul nuw i64 [[TMP16]], 6
; DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP17]]
; DEFAULT-NEXT: store <vscale x 2 x double> [[TMP9]], ptr [[NEXT_GEP]], align 8
; DEFAULT-NEXT: store <vscale x 2 x double> [[TMP9]], ptr [[TMP12]], align 8
; DEFAULT-NEXT: store <vscale x 2 x double> [[TMP9]], ptr [[TMP15]], align 8
; DEFAULT-NEXT: store <vscale x 2 x double> [[TMP9]], ptr [[TMP18]], align 8
; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]]
; DEFAULT-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
; DEFAULT-NEXT: br i1 [[TMP19]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]]
; DEFAULT: [[MIDDLE_BLOCK]]:
; DEFAULT-NEXT: br label %[[SCALAR_PH:.*]]
; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 257, [[N_VEC]]
; DEFAULT-NEXT: br i1 [[CMP_N]], [[EXIT:label %.*]], label %[[SCALAR_PH]]
; DEFAULT: [[SCALAR_PH]]:
;
; PRED-LABEL: define void @multiple_exit_conditions(
@@ -660,16 +682,17 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]]
; COMMON: [[PRED_STORE_CONTINUE12]]:
; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT1:.*]]
; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[PRED_STORE_CONTINUE14:.*]]
; COMMON: [[PRED_STORE_IF13]]:
; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1
; COMMON-NEXT: br label %[[EXIT1]]
; COMMON: [[EXIT1]]:
; COMMON-NEXT: br label %[[SCALAR_PH1:.*]]
; COMMON: [[SCALAR_PH1]]:
; COMMON-NEXT: br [[EXIT:label %.*]]
; COMMON: [[SCALAR_PH:.*:]]
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE14]]
; COMMON: [[PRED_STORE_CONTINUE14]]:
; COMMON-NEXT: br label %[[MIDDLE_BLOCK:.*]]
; COMMON: [[MIDDLE_BLOCK]]:
; COMMON-NEXT: br label %[[EXIT:.*]]
; COMMON: [[EXIT]]:
; COMMON-NEXT: ret void
;
entry:
br label %loop
16 changes: 11 additions & 5 deletions llvm/test/Transforms/LoopVectorize/AArch64/fully-unrolled-cost.ll
@@ -9,11 +9,12 @@ define i64 @test(ptr %a, ptr %b) #0 {
; CHECK-LABEL: LV: Checking a loop in 'test'
; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}>
; CHECK: Cost for VF 8: 30
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}>
; CHECK: Cost for VF 16: 56
; CHECK: LV: Selecting VF: 16
entry:
@@ -43,12 +44,13 @@ define i64 @test_external_iv_user(ptr %a, ptr %b) #0 {
; CHECK-LABEL: LV: Checking a loop in 'test_external_iv_user'
; CHECK: Cost of 1 for VF 8: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<{{.+}}>, vp<{{.+}}>
; CHECK: Cost for VF 8: 30
; CHECK-NEXT: Cost of 1 for VF 16: induction instruction %i.iv.next = add nuw nsw i64 %i.iv, 1
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<{{.+}}>, vp<{{.+}}>
; CHECK: Cost for VF 16: 57
; CHECK: LV: Selecting VF: vscale x 2
entry:
@@ -80,12 +82,13 @@ define i64 @test_two_ivs(ptr %a, ptr %b, i64 %start) #0 {
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: induction instruction %j.iv.next = add nuw nsw i64 %j.iv, 1
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 1 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %i.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}>
; CHECK: Cost for VF 8: 27
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %i.iv = phi i64 [ 0, %entry ], [ %i.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %j.iv = phi i64 [ %start, %entry ], [ %j.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}>
; CHECK: Cost for VF 16: 41
; CHECK: LV: Selecting VF: 16
entry:
@@ -116,11 +119,14 @@ define i1 @test_extra_cmp_user(ptr nocapture noundef %dst, ptr nocapture noundef
; CHECK-LABEL: LV: Checking a loop in 'test_extra_cmp_user'
; CHECK: Cost of 4 for VF 8: induction instruction %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
; CHECK-NEXT: Cost of 0 for VF 8: induction instruction %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: Cost of 4 for VF 8: exit condition instruction %exitcond.not = icmp eq i64 %indvars.iv.next, 16
; CHECK-NEXT: Cost of 0 for VF 8: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost for VF 8: 12
; CHECK: Cost of 4 for VF 8: WIDEN ir<%{{.+}}> = icmp eq ir<%{{.+}}>, ir<16>
; CHECK: Cost of 1 for VF 8: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}>
; CHECK: Cost for VF 8: 13
; CHECK-NEXT: Cost of 0 for VF 16: induction instruction %indvars.iv = phi i64 [ 0, %entry ], [ %indvars.iv.next, %for.body ]
; CHECK-NEXT: Cost of 0 for VF 16: EMIT vp<{{.+}}> = CANONICAL-INDUCTION ir<0>, vp<%index.next>
; CHECK: Cost of 0 for VF 16: WIDEN ir<%{{.+}}> = icmp eq ir<%{{.+}}>, ir<16>
; CHECK: Cost of 0 for VF 16: EMIT branch-on-count vp<%{{.+}}>, vp<%{{.+}}>
; CHECK: Cost for VF 16: 4
; CHECK: LV: Selecting VF: 16
entry: