Skip to content

Commit e39fef4

Browse files
committed
[VPlan] Don't apply predication discount to non-originally-predicated blocks
Split off from llvm#158690. Currently, if an instruction needs to be predicated due to tail folding, it will also have a predication discount applied to it in multiple places. This is likely inaccurate because we can expect a tail-folded instruction to be executed on every iteration bar the last. This fixes it by checking whether the instruction/block was originally predicated, and in doing so prevents vectorization with tail folding where we would have had to scalarize the memory op anyway.
1 parent 4f33d7b commit e39fef4

File tree

9 files changed

+78
-558
lines changed

9 files changed

+78
-558
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 29 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1249,6 +1249,25 @@ class LoopVectorizationCostModel {
12491249
/// Superset of instructions that return true for isScalarWithPredication.
12501250
bool isPredicatedInst(Instruction *I) const;
12511251

1252+
/// A helper function that returns how much we should divide the cost of a
1253+
/// predicated block by. Typically this is the reciprocal of the block
1254+
/// probability, i.e. if we return X we are assuming the predicated block will
1255+
/// execute once for every X iterations of the loop header so the block should
1256+
/// only contribute 1/X of its cost to the total cost calculation, but when
1257+
/// optimizing for code size it will just be 1 as code size costs don't depend
1258+
/// on execution probabilities.
1259+
///
1260+
/// TODO: We should use actual block probability here, if available.
1261+
/// Currently, we always assume predicated blocks have a 50% chance of
1262+
/// executing.
1263+
inline unsigned
1264+
getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
1265+
BasicBlock *BB) const {
1266+
if (!Legal->blockNeedsPredication(BB))
1267+
return 1;
1268+
return CostKind == TTI::TCK_CodeSize ? 1 : 2;
1269+
}
1270+
12521271
/// Return the costs for our two available strategies for lowering a
12531272
/// div/rem operation which requires speculating at least one lane.
12541273
/// First result is for scalarization (will be invalid for scalable
@@ -2902,7 +2921,8 @@ LoopVectorizationCostModel::getDivRemSpeculationCost(Instruction *I,
29022921
// Scale the cost by the probability of executing the predicated blocks.
29032922
// This assumes the predicated block for each vector lane is equally
29042923
// likely.
2905-
ScalarizationCost = ScalarizationCost / getPredBlockCostDivisor(CostKind);
2924+
ScalarizationCost =
2925+
ScalarizationCost / getPredBlockCostDivisor(CostKind, I->getParent());
29062926
}
29072927

29082928
InstructionCost SafeDivisorCost = 0;
@@ -5035,7 +5055,7 @@ InstructionCost LoopVectorizationCostModel::computePredInstDiscount(
50355055
}
50365056

50375057
// Scale the total scalar cost by block probability.
5038-
ScalarCost /= getPredBlockCostDivisor(CostKind);
5058+
ScalarCost /= getPredBlockCostDivisor(CostKind, I->getParent());
50395059

50405060
// Compute the discount. A non-negative discount means the vector version
50415061
// of the instruction costs more, and scalarizing would be beneficial.
@@ -5088,7 +5108,7 @@ InstructionCost LoopVectorizationCostModel::expectedCost(ElementCount VF) {
50885108
// cost by the probability of executing it. blockNeedsPredication from
50895109
// Legal is used so as to not include all blocks in tail folded loops.
50905110
if (VF.isScalar() && Legal->blockNeedsPredication(BB))
5091-
BlockCost /= getPredBlockCostDivisor(CostKind);
5111+
BlockCost /= getPredBlockCostDivisor(CostKind, BB);
50925112

50935113
Cost += BlockCost;
50945114
}
@@ -5167,7 +5187,7 @@ LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
51675187
// conditional branches, but may not be executed for each vector lane. Scale
51685188
// the cost by the probability of executing the predicated block.
51695189
if (isPredicatedInst(I)) {
5170-
Cost /= getPredBlockCostDivisor(CostKind);
5190+
Cost /= getPredBlockCostDivisor(CostKind, I->getParent());
51715191

51725192
// Add the cost of an i1 extract and a branch
51735193
auto *VecI1Ty =
@@ -6727,6 +6747,11 @@ bool VPCostContext::skipCostComputation(Instruction *UI, bool IsVector) const {
67276747
SkipCostComputation.contains(UI);
67286748
}
67296749

6750+
unsigned VPCostContext::getPredBlockCostDivisor(
6751+
TargetTransformInfo::TargetCostKind CostKind, BasicBlock *BB) const {
6752+
return CM.getPredBlockCostDivisor(CostKind, BB);
6753+
}
6754+
67306755
InstructionCost
67316756
LoopVectorizationPlanner::precomputeCosts(VPlan &Plan, ElementCount VF,
67326757
VPCostContext &CostCtx) const {

llvm/lib/Transforms/Vectorize/VPlan.cpp

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -855,7 +855,9 @@ InstructionCost VPRegionBlock::cost(ElementCount VF, VPCostContext &Ctx) {
855855
// For the scalar case, we may not always execute the original predicated
856856
// block. Thus, scale the block's cost by the probability of executing it.
857857
if (VF.isScalar())
858-
return ThenCost / getPredBlockCostDivisor(Ctx.CostKind);
858+
if (auto *VPIRBB = dyn_cast<VPIRBasicBlock>(Then))
859+
return ThenCost / Ctx.getPredBlockCostDivisor(Ctx.CostKind,
860+
VPIRBB->getIRBasicBlock());
859861

860862
return ThenCost;
861863
}

llvm/lib/Transforms/Vectorize/VPlanHelpers.h

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -50,21 +50,6 @@ Value *getRuntimeVF(IRBuilderBase &B, Type *Ty, ElementCount VF);
5050
Value *createStepForVF(IRBuilderBase &B, Type *Ty, ElementCount VF,
5151
int64_t Step);
5252

53-
/// A helper function that returns how much we should divide the cost of a
54-
/// predicated block by. Typically this is the reciprocal of the block
55-
/// probability, i.e. if we return X we are assuming the predicated block will
56-
/// execute once for every X iterations of the loop header so the block should
57-
/// only contribute 1/X of its cost to the total cost calculation, but when
58-
/// optimizing for code size it will just be 1 as code size costs don't depend
59-
/// on execution probabilities.
60-
///
61-
/// TODO: We should use actual block probability here, if available. Currently,
62-
/// we always assume predicated blocks have a 50% chance of executing.
63-
inline unsigned
64-
getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind) {
65-
return CostKind == TTI::TCK_CodeSize ? 1 : 2;
66-
}
67-
6853
/// A range of powers-of-2 vectorization factors with fixed start and
6954
/// adjustable end. The range includes start and excludes end, e.g.,:
7055
/// [1, 16) = {1, 2, 4, 8}
@@ -364,6 +349,10 @@ struct VPCostContext {
364349
/// has already been pre-computed.
365350
bool skipCostComputation(Instruction *UI, bool IsVector) const;
366351

352+
/// \returns how much the cost of a predicated block should be divided by.
353+
unsigned getPredBlockCostDivisor(TargetTransformInfo::TargetCostKind CostKind,
354+
BasicBlock *BB) const;
355+
367356
/// Returns the OperandInfo for \p V, if it is a live-in.
368357
TargetTransformInfo::OperandValueInfo getOperandInfo(VPValue *V) const;
369358

llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3170,7 +3170,7 @@ InstructionCost VPReplicateRecipe::computeCost(ElementCount VF,
31703170
// Scale the cost by the probability of executing the predicated blocks.
31713171
// This assumes the predicated block for each vector lane is equally
31723172
// likely.
3173-
ScalarCost /= getPredBlockCostDivisor(Ctx.CostKind);
3173+
ScalarCost /= Ctx.getPredBlockCostDivisor(Ctx.CostKind, UI->getParent());
31743174
return ScalarCost;
31753175
}
31763176
case Instruction::Load:

llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll

Lines changed: 12 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -612,63 +612,18 @@ define void @low_trip_count_fold_tail_scalarized_store(ptr %dst) {
612612
;
613613
; COMMON-LABEL: define void @low_trip_count_fold_tail_scalarized_store(
614614
; COMMON-SAME: ptr [[DST:%.*]]) {
615-
; COMMON-NEXT: [[ENTRY:.*:]]
616-
; COMMON-NEXT: br label %[[VECTOR_PH:.*]]
617-
; COMMON: [[VECTOR_PH]]:
618-
; COMMON-NEXT: br label %[[VECTOR_BODY:.*]]
619-
; COMMON: [[VECTOR_BODY]]:
620-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF:.*]], label %[[PRED_STORE_CONTINUE:.*]]
621-
; COMMON: [[PRED_STORE_IF]]:
622-
; COMMON-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[DST]], i64 0
623-
; COMMON-NEXT: store i8 0, ptr [[TMP0]], align 1
624-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE]]
625-
; COMMON: [[PRED_STORE_CONTINUE]]:
626-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF1:.*]], label %[[PRED_STORE_CONTINUE2:.*]]
627-
; COMMON: [[PRED_STORE_IF1]]:
628-
; COMMON-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[DST]], i64 1
629-
; COMMON-NEXT: store i8 1, ptr [[TMP1]], align 1
630-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE2]]
631-
; COMMON: [[PRED_STORE_CONTINUE2]]:
632-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF3:.*]], label %[[PRED_STORE_CONTINUE4:.*]]
633-
; COMMON: [[PRED_STORE_IF3]]:
634-
; COMMON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[DST]], i64 2
635-
; COMMON-NEXT: store i8 2, ptr [[TMP2]], align 1
636-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE4]]
637-
; COMMON: [[PRED_STORE_CONTINUE4]]:
638-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF5:.*]], label %[[PRED_STORE_CONTINUE6:.*]]
639-
; COMMON: [[PRED_STORE_IF5]]:
640-
; COMMON-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[DST]], i64 3
641-
; COMMON-NEXT: store i8 3, ptr [[TMP3]], align 1
642-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE6]]
643-
; COMMON: [[PRED_STORE_CONTINUE6]]:
644-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF7:.*]], label %[[PRED_STORE_CONTINUE8:.*]]
645-
; COMMON: [[PRED_STORE_IF7]]:
646-
; COMMON-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[DST]], i64 4
647-
; COMMON-NEXT: store i8 4, ptr [[TMP4]], align 1
648-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE8]]
649-
; COMMON: [[PRED_STORE_CONTINUE8]]:
650-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF9:.*]], label %[[PRED_STORE_CONTINUE10:.*]]
651-
; COMMON: [[PRED_STORE_IF9]]:
652-
; COMMON-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 5
653-
; COMMON-NEXT: store i8 5, ptr [[TMP5]], align 1
654-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE10]]
655-
; COMMON: [[PRED_STORE_CONTINUE10]]:
656-
; COMMON-NEXT: br i1 true, label %[[PRED_STORE_IF11:.*]], label %[[PRED_STORE_CONTINUE12:.*]]
657-
; COMMON: [[PRED_STORE_IF11]]:
658-
; COMMON-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[DST]], i64 6
659-
; COMMON-NEXT: store i8 6, ptr [[TMP6]], align 1
660-
; COMMON-NEXT: br label %[[PRED_STORE_CONTINUE12]]
661-
; COMMON: [[PRED_STORE_CONTINUE12]]:
662-
; COMMON-NEXT: br i1 false, label %[[PRED_STORE_IF13:.*]], label %[[EXIT:.*]]
663-
; COMMON: [[PRED_STORE_IF13]]:
664-
; COMMON-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[DST]], i64 7
665-
; COMMON-NEXT: store i8 7, ptr [[TMP7]], align 1
666-
; COMMON-NEXT: br label %[[EXIT]]
667-
; COMMON: [[EXIT]]:
668-
; COMMON-NEXT: br label %[[SCALAR_PH:.*]]
669-
; COMMON: [[SCALAR_PH]]:
670-
; COMMON-NEXT: br [[EXIT1:label %.*]]
671-
; COMMON: [[SCALAR_PH1:.*:]]
615+
; COMMON-NEXT: [[ENTRY:.*]]:
616+
; COMMON-NEXT: br label %[[EXIT1:.*]]
617+
; COMMON: [[EXIT1]]:
618+
; COMMON-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[EXIT1]] ]
619+
; COMMON-NEXT: [[IV_TRUNC:%.*]] = trunc i64 [[IV]] to i8
620+
; COMMON-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[IV]]
621+
; COMMON-NEXT: store i8 [[IV_TRUNC]], ptr [[GEP]], align 1
622+
; COMMON-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
623+
; COMMON-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], 7
624+
; COMMON-NEXT: br i1 [[EC]], label %[[SCALAR_PH1:.*]], label %[[EXIT1]]
625+
; COMMON: [[SCALAR_PH1]]:
626+
; COMMON-NEXT: ret void
672627
;
673628
entry:
674629
br label %loop

0 commit comments

Comments
 (0)