Skip to content

Commit 48eb697

Browse files
authored
[LV] Count cost of middle block if TC <= VF. (#168949)
If the expected trip count is less than or equal to the VF, the vector loop will only execute a single iteration. When that's the case, the cost of the middle block has the same impact as the cost of the vector loop. Include it in isOutsideLoopWorkProfitable to avoid vectorizing when the extra work in the middle block makes it unprofitable. Note that isOutsideLoopWorkProfitable already scales the cost of blocks outside the vector region, but the patch restricts accounting for the middle block to cases where ExpectedTC <= VF, to initially catch some worst cases and avoid regressions. This initial version should specifically avoid unprofitable tail-folding for loops with low trip counts after re-applying #149042. PR: #168949
1 parent e92bb83 commit 48eb697

File tree

2 files changed

+31
-37
lines changed

2 files changed

+31
-37
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9257,6 +9257,7 @@ static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx,
92579257
/// 2. In the case of loops with uncountable early exits, we may have to do
92589258
/// extra work when exiting the loop early, such as calculating the final
92599259
/// exit values of variables used outside the loop.
9260+
/// 3. The middle block, if expected TC <= VF.Width.
92609261
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
92619262
VectorizationFactor &VF, Loop *L,
92629263
PredicatedScalarEvolution &PSE,
@@ -9271,6 +9272,14 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
92719272
// one exists.
92729273
TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width);
92739274

9275+
// If the expected trip count is less than or equal to the VF, the vector loop will only
9276+
// execute a single iteration. Then the middle block is executed the same
9277+
// number of times as the vector region.
9278+
// TODO: Extend logic to always account for the cost of the middle block.
9279+
auto ExpectedTC = getSmallBestKnownTC(PSE, L);
9280+
if (ExpectedTC && ElementCount::isKnownLE(*ExpectedTC, VF.Width))
9281+
TotalCost += Plan.getMiddleBlock()->cost(VF.Width, CostCtx);
9282+
92749283
// When interleaving only scalar and vector cost will be equal, which in turn
92759284
// would lead to a divide by 0. Fall back to hard threshold.
92769285
if (VF.Width.isScalar()) {
@@ -9301,9 +9310,11 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
93019310
// The total cost of the vector loop is
93029311
// RtC + VecC * (TC / VF) + EpiC
93039312
// where
9304-
// * RtC is the cost of the generated runtime checks plus the cost of
9305-
// performing any additional work in the vector.early.exit block for loops
9306-
// with uncountable early exits.
9313+
// * RtC is the sum of the costs of
9314+
// - the generated runtime checks
9315+
// - performing any additional work in the vector.early.exit block for
9316+
// loops with uncountable early exits.
9317+
// - the middle block, if ExpectedTC <= VF.Width.
93079318
// * VecC is the cost of a single vector iteration.
93089319
// * TC is the actual trip count of the loop
93099320
// * VF is the vectorization factor

llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll

Lines changed: 17 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -569,53 +569,36 @@ define double @test_load_used_by_other_load_scev_low_trip_count(ptr %ptr.a, ptr
569569
; I64-NEXT: [[ENTRY:.*]]:
570570
; I64-NEXT: br label %[[OUTER_LOOP:.*]]
571571
; I64: [[OUTER_LOOP_LOOPEXIT:.*]]:
572+
; I64-NEXT: [[RESULT_LCSSA:%.*]] = phi double [ [[RESULT:%.*]], %[[INNER_LOOP:.*]] ]
572573
; I64-NEXT: br label %[[OUTER_LOOP]]
573574
; I64: [[OUTER_LOOP]]:
574-
; I64-NEXT: [[ACCUM:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP29:%.*]], %[[OUTER_LOOP_LOOPEXIT]] ]
575+
; I64-NEXT: [[ACCUM:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[RESULT_LCSSA]], %[[OUTER_LOOP_LOOPEXIT]] ]
575576
; I64-NEXT: [[COND:%.*]] = call i1 @cond()
576577
; I64-NEXT: br i1 [[COND]], label %[[INNER_LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
577578
; I64: [[INNER_LOOP_PREHEADER]]:
578-
; I64-NEXT: br label %[[VECTOR_PH:.*]]
579-
; I64: [[VECTOR_PH]]:
580-
; I64-NEXT: br label %[[VECTOR_BODY:.*]]
581-
; I64: [[VECTOR_BODY]]:
582-
; I64-NEXT: [[TMP0:%.*]] = add i64 0, 1
583-
; I64-NEXT: [[TMP1:%.*]] = add i64 1, 1
584-
; I64-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR_C]], i64 [[TMP0]]
579+
; I64-NEXT: br label %[[INNER_LOOP]]
580+
; I64: [[INNER_LOOP]]:
581+
; I64-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[INNER_LOOP_PREHEADER]] ]
582+
; I64-NEXT: [[ACCUM_INNER:%.*]] = phi double [ [[MUL1:%.*]], %[[INNER_LOOP]] ], [ [[ACCUM]], %[[INNER_LOOP_PREHEADER]] ]
583+
; I64-NEXT: [[TMP1:%.*]] = add i64 [[IV]], 1
585584
; I64-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR_C]], i64 [[TMP1]]
586-
; I64-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[PTR_A]], i64 [[TMP0]]
587585
; I64-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[PTR_A]], i64 [[TMP1]]
588-
; I64-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
589586
; I64-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
590-
; I64-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[TMP6]]
591587
; I64-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[TMP7]]
592588
; I64-NEXT: [[TMP10:%.*]] = load double, ptr [[PTR_A]], align 8
593-
; I64-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i64 0
594-
; I64-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
595-
; I64-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], zeroinitializer
596-
; I64-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP2]], i64 8
589+
; I64-NEXT: [[ADD1:%.*]] = fadd double [[TMP10]], 0.000000e+00
597590
; I64-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8
598-
; I64-NEXT: [[TMP14:%.*]] = load double, ptr [[TMP12]], align 8
599591
; I64-NEXT: [[TMP15:%.*]] = load double, ptr [[TMP13]], align 8
600-
; I64-NEXT: [[TMP16:%.*]] = insertelement <2 x double> poison, double [[TMP14]], i32 0
601-
; I64-NEXT: [[TMP17:%.*]] = insertelement <2 x double> [[TMP16]], double [[TMP15]], i32 1
602-
; I64-NEXT: [[TMP18:%.*]] = fmul <2 x double> [[TMP11]], zeroinitializer
603-
; I64-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x double> poison, double [[ACCUM]], i64 0
604-
; I64-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT1]], <2 x double> poison, <2 x i32> zeroinitializer
605-
; I64-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLAT2]], <2 x double> [[TMP18]], <2 x i32> <i32 1, i32 2>
606-
; I64-NEXT: [[TMP20:%.*]] = fmul <2 x double> [[TMP17]], zeroinitializer
607-
; I64-NEXT: [[TMP21:%.*]] = fadd <2 x double> [[TMP20]], zeroinitializer
608-
; I64-NEXT: [[TMP22:%.*]] = fadd <2 x double> [[TMP21]], splat (double 1.000000e+00)
609-
; I64-NEXT: [[TMP23:%.*]] = load double, ptr [[TMP8]], align 8
592+
; I64-NEXT: [[MUL1]] = fmul double [[ADD1]], 0.000000e+00
593+
; I64-NEXT: [[MUL2:%.*]] = fmul double [[TMP15]], 0.000000e+00
594+
; I64-NEXT: [[ADD2:%.*]] = fadd double [[MUL2]], 0.000000e+00
595+
; I64-NEXT: [[ADD3:%.*]] = fadd double [[ADD2]], 1.000000e+00
610596
; I64-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP9]], align 8
611-
; I64-NEXT: [[TMP25:%.*]] = insertelement <2 x double> poison, double [[TMP23]], i32 0
612-
; I64-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[TMP25]], double [[TMP24]], i32 1
613-
; I64-NEXT: [[TMP27:%.*]] = fdiv <2 x double> [[TMP26]], [[TMP22]]
614-
; I64-NEXT: [[TMP28:%.*]] = fsub <2 x double> [[TMP19]], [[TMP27]]
615-
; I64-NEXT: br label %[[MIDDLE_BLOCK:.*]]
616-
; I64: [[MIDDLE_BLOCK]]:
617-
; I64-NEXT: [[TMP29]] = extractelement <2 x double> [[TMP28]], i32 1
618-
; I64-NEXT: br label %[[OUTER_LOOP_LOOPEXIT]]
597+
; I64-NEXT: [[DIV:%.*]] = fdiv double [[TMP24]], [[ADD3]]
598+
; I64-NEXT: [[RESULT]] = fsub double [[ACCUM_INNER]], [[DIV]]
599+
; I64-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
600+
; I64-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1
601+
; I64-NEXT: br i1 [[EXITCOND]], label %[[OUTER_LOOP_LOOPEXIT]], label %[[INNER_LOOP]]
619602
; I64: [[EXIT]]:
620603
; I64-NEXT: ret double [[ACCUM]]
621604
;

0 commit comments

Comments
 (0)