Skip to content

Commit 243c8d7

Browse files
committed
[LV] Count cost of middle block if TC <= VF.
If the expected trip count is less than or equal to the VF, the vector loop will only execute a single iteration. When that's the case, the cost of the middle block has the same impact as the cost of the vector loop. Include it in isOutsideLoopWorkProfitable to avoid vectorizing when the extra work in the middle block makes it unprofitable. Note that isOutsideLoopWorkProfitable already scales the cost of blocks outside the vector region, but the patch restricts accounting for the middle block to cases where ExpectedTC <= VF, to initially catch some worst cases and avoid regressions. This initial version should specifically avoid unprofitable tail-folding for loops with low trip counts after re-applying llvm#149042.
1 parent a3f6c43 commit 243c8d7

File tree

2 files changed

+25
-34
lines changed

2 files changed

+25
-34
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9283,6 +9283,14 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
92839283
// one exists.
92849284
TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width);
92859285

9286+
// If the expected trip count is at most the VF, the vector loop will only
9287+
// execute a single iteration. Then the middle block is executed the same
9288+
// number of times as the vector region.
9289+
// TODO: Extend logic to always account for the cost of the middle block.
9290+
auto ExpectedTC = getSmallBestKnownTC(PSE, L);
9291+
if (ExpectedTC && ElementCount::isKnownLE(*ExpectedTC, VF.Width))
9292+
TotalCost += Plan.getMiddleBlock()->cost(VF.Width, CostCtx);
9293+
92869294
// When interleaving only scalar and vector cost will be equal, which in turn
92879295
// would lead to a divide by 0. Fall back to hard threshold.
92889296
if (VF.Width.isScalar()) {

llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll

Lines changed: 17 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -569,53 +569,36 @@ define double @test_load_used_by_other_load_scev_low_trip_count(ptr %ptr.a, ptr
569569
; I64-NEXT: [[ENTRY:.*]]:
570570
; I64-NEXT: br label %[[OUTER_LOOP:.*]]
571571
; I64: [[OUTER_LOOP_LOOPEXIT:.*]]:
572+
; I64-NEXT: [[RESULT_LCSSA:%.*]] = phi double [ [[RESULT:%.*]], %[[INNER_LOOP:.*]] ]
572573
; I64-NEXT: br label %[[OUTER_LOOP]]
573574
; I64: [[OUTER_LOOP]]:
574-
; I64-NEXT: [[ACCUM:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP29:%.*]], %[[OUTER_LOOP_LOOPEXIT]] ]
575+
; I64-NEXT: [[ACCUM:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[RESULT_LCSSA]], %[[OUTER_LOOP_LOOPEXIT]] ]
575576
; I64-NEXT: [[COND:%.*]] = call i1 @cond()
576577
; I64-NEXT: br i1 [[COND]], label %[[INNER_LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
577578
; I64: [[INNER_LOOP_PREHEADER]]:
578-
; I64-NEXT: br label %[[VECTOR_PH:.*]]
579-
; I64: [[VECTOR_PH]]:
580-
; I64-NEXT: br label %[[VECTOR_BODY:.*]]
581-
; I64: [[VECTOR_BODY]]:
582-
; I64-NEXT: [[TMP0:%.*]] = add i64 0, 1
583-
; I64-NEXT: [[TMP1:%.*]] = add i64 1, 1
584-
; I64-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR_C]], i64 [[TMP0]]
579+
; I64-NEXT: br label %[[INNER_LOOP]]
580+
; I64: [[INNER_LOOP]]:
581+
; I64-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[INNER_LOOP_PREHEADER]] ]
582+
; I64-NEXT: [[ACCUM_INNER:%.*]] = phi double [ [[MUL1:%.*]], %[[INNER_LOOP]] ], [ [[ACCUM]], %[[INNER_LOOP_PREHEADER]] ]
583+
; I64-NEXT: [[TMP1:%.*]] = add i64 [[IV]], 1
585584
; I64-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR_C]], i64 [[TMP1]]
586-
; I64-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[PTR_A]], i64 [[TMP0]]
587585
; I64-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[PTR_A]], i64 [[TMP1]]
588-
; I64-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
589586
; I64-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
590-
; I64-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[TMP6]]
591587
; I64-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[TMP7]]
592588
; I64-NEXT: [[TMP10:%.*]] = load double, ptr [[PTR_A]], align 8
593-
; I64-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i64 0
594-
; I64-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
595-
; I64-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], zeroinitializer
596-
; I64-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP2]], i64 8
589+
; I64-NEXT: [[ADD1:%.*]] = fadd double [[TMP10]], 0.000000e+00
597590
; I64-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8
598-
; I64-NEXT: [[TMP14:%.*]] = load double, ptr [[TMP12]], align 8
599591
; I64-NEXT: [[TMP15:%.*]] = load double, ptr [[TMP13]], align 8
600-
; I64-NEXT: [[TMP16:%.*]] = insertelement <2 x double> poison, double [[TMP14]], i32 0
601-
; I64-NEXT: [[TMP17:%.*]] = insertelement <2 x double> [[TMP16]], double [[TMP15]], i32 1
602-
; I64-NEXT: [[TMP18:%.*]] = fmul <2 x double> [[TMP11]], zeroinitializer
603-
; I64-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x double> poison, double [[ACCUM]], i64 0
604-
; I64-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT1]], <2 x double> poison, <2 x i32> zeroinitializer
605-
; I64-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLAT2]], <2 x double> [[TMP18]], <2 x i32> <i32 1, i32 2>
606-
; I64-NEXT: [[TMP20:%.*]] = fmul <2 x double> [[TMP17]], zeroinitializer
607-
; I64-NEXT: [[TMP21:%.*]] = fadd <2 x double> [[TMP20]], zeroinitializer
608-
; I64-NEXT: [[TMP22:%.*]] = fadd <2 x double> [[TMP21]], splat (double 1.000000e+00)
609-
; I64-NEXT: [[TMP23:%.*]] = load double, ptr [[TMP8]], align 8
592+
; I64-NEXT: [[MUL1]] = fmul double [[ADD1]], 0.000000e+00
593+
; I64-NEXT: [[MUL2:%.*]] = fmul double [[TMP15]], 0.000000e+00
594+
; I64-NEXT: [[ADD2:%.*]] = fadd double [[MUL2]], 0.000000e+00
595+
; I64-NEXT: [[ADD3:%.*]] = fadd double [[ADD2]], 1.000000e+00
610596
; I64-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP9]], align 8
611-
; I64-NEXT: [[TMP25:%.*]] = insertelement <2 x double> poison, double [[TMP23]], i32 0
612-
; I64-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[TMP25]], double [[TMP24]], i32 1
613-
; I64-NEXT: [[TMP27:%.*]] = fdiv <2 x double> [[TMP26]], [[TMP22]]
614-
; I64-NEXT: [[TMP28:%.*]] = fsub <2 x double> [[TMP19]], [[TMP27]]
615-
; I64-NEXT: br label %[[MIDDLE_BLOCK:.*]]
616-
; I64: [[MIDDLE_BLOCK]]:
617-
; I64-NEXT: [[TMP29]] = extractelement <2 x double> [[TMP28]], i32 1
618-
; I64-NEXT: br label %[[OUTER_LOOP_LOOPEXIT]]
597+
; I64-NEXT: [[DIV:%.*]] = fdiv double [[TMP24]], [[ADD3]]
598+
; I64-NEXT: [[RESULT]] = fsub double [[ACCUM_INNER]], [[DIV]]
599+
; I64-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
600+
; I64-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1
601+
; I64-NEXT: br i1 [[EXITCOND]], label %[[OUTER_LOOP_LOOPEXIT]], label %[[INNER_LOOP]]
619602
; I64: [[EXIT]]:
620603
; I64-NEXT: ret double [[ACCUM]]
621604
;

0 commit comments

Comments
 (0)