Skip to content

Commit 9ddc877

Browse files
author
git apple-llvm automerger
committed
Merge commit '48eb697441e2' from llvm.org/main into next
2 parents 95c0000 + 48eb697 commit 9ddc877

File tree

2 files changed

+31
-37
lines changed

2 files changed

+31
-37
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9257,6 +9257,7 @@ static InstructionCost calculateEarlyExitCost(VPCostContext &CostCtx,
92579257
/// 2. In the case of loops with uncountable early exits, we may have to do
92589258
/// extra work when exiting the loop early, such as calculating the final
92599259
/// exit values of variables used outside the loop.
9260+
/// 3. The middle block, if expected TC <= VF.Width.
92609261
static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
92619262
VectorizationFactor &VF, Loop *L,
92629263
PredicatedScalarEvolution &PSE,
@@ -9271,6 +9272,14 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
92719272
// one exists.
92729273
TotalCost += calculateEarlyExitCost(CostCtx, Plan, VF.Width);
92739274

9275+
// If the expected trip count is at most the VF, the vector loop will only
9276+
// execute a single iteration. Then the middle block is executed the same
9277+
// number of times as the vector region.
9278+
// TODO: Extend logic to always account for the cost of the middle block.
9279+
auto ExpectedTC = getSmallBestKnownTC(PSE, L);
9280+
if (ExpectedTC && ElementCount::isKnownLE(*ExpectedTC, VF.Width))
9281+
TotalCost += Plan.getMiddleBlock()->cost(VF.Width, CostCtx);
9282+
92749283
// When interleaving only scalar and vector cost will be equal, which in turn
92759284
// would lead to a divide by 0. Fall back to hard threshold.
92769285
if (VF.Width.isScalar()) {
@@ -9301,9 +9310,11 @@ static bool isOutsideLoopWorkProfitable(GeneratedRTChecks &Checks,
93019310
// The total cost of the vector loop is
93029311
// RtC + VecC * (TC / VF) + EpiC
93039312
// where
9304-
// * RtC is the cost of the generated runtime checks plus the cost of
9305-
// performing any additional work in the vector.early.exit block for loops
9306-
// with uncountable early exits.
9313+
// * RtC is the sum of the costs of
9314+
// - the generated runtime checks
9315+
// - performing any additional work in the vector.early.exit block for
9316+
// loops with uncountable early exits.
9317+
// - the middle block, if ExpectedTC <= VF.Width.
93079318
// * VecC is the cost of a single vector iteration.
93089319
// * TC is the actual trip count of the loop
93099320
// * VF is the vectorization factor

llvm/test/Transforms/LoopVectorize/X86/replicating-load-store-costs.ll

Lines changed: 17 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -569,53 +569,36 @@ define double @test_load_used_by_other_load_scev_low_trip_count(ptr %ptr.a, ptr
569569
; I64-NEXT: [[ENTRY:.*]]:
570570
; I64-NEXT: br label %[[OUTER_LOOP:.*]]
571571
; I64: [[OUTER_LOOP_LOOPEXIT:.*]]:
572+
; I64-NEXT: [[RESULT_LCSSA:%.*]] = phi double [ [[RESULT:%.*]], %[[INNER_LOOP:.*]] ]
572573
; I64-NEXT: br label %[[OUTER_LOOP]]
573574
; I64: [[OUTER_LOOP]]:
574-
; I64-NEXT: [[ACCUM:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[TMP29:%.*]], %[[OUTER_LOOP_LOOPEXIT]] ]
575+
; I64-NEXT: [[ACCUM:%.*]] = phi double [ 0.000000e+00, %[[ENTRY]] ], [ [[RESULT_LCSSA]], %[[OUTER_LOOP_LOOPEXIT]] ]
575576
; I64-NEXT: [[COND:%.*]] = call i1 @cond()
576577
; I64-NEXT: br i1 [[COND]], label %[[INNER_LOOP_PREHEADER:.*]], label %[[EXIT:.*]]
577578
; I64: [[INNER_LOOP_PREHEADER]]:
578-
; I64-NEXT: br label %[[VECTOR_PH:.*]]
579-
; I64: [[VECTOR_PH]]:
580-
; I64-NEXT: br label %[[VECTOR_BODY:.*]]
581-
; I64: [[VECTOR_BODY]]:
582-
; I64-NEXT: [[TMP0:%.*]] = add i64 0, 1
583-
; I64-NEXT: [[TMP1:%.*]] = add i64 1, 1
584-
; I64-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[PTR_C]], i64 [[TMP0]]
579+
; I64-NEXT: br label %[[INNER_LOOP]]
580+
; I64: [[INNER_LOOP]]:
581+
; I64-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], %[[INNER_LOOP]] ], [ 0, %[[INNER_LOOP_PREHEADER]] ]
582+
; I64-NEXT: [[ACCUM_INNER:%.*]] = phi double [ [[MUL1:%.*]], %[[INNER_LOOP]] ], [ [[ACCUM]], %[[INNER_LOOP_PREHEADER]] ]
583+
; I64-NEXT: [[TMP1:%.*]] = add i64 [[IV]], 1
585584
; I64-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[PTR_C]], i64 [[TMP1]]
586-
; I64-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[PTR_A]], i64 [[TMP0]]
587585
; I64-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[PTR_A]], i64 [[TMP1]]
588-
; I64-NEXT: [[TMP6:%.*]] = load i64, ptr [[TMP4]], align 8
589586
; I64-NEXT: [[TMP7:%.*]] = load i64, ptr [[TMP5]], align 8
590-
; I64-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[TMP6]]
591587
; I64-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[PTR_B]], i64 [[TMP7]]
592588
; I64-NEXT: [[TMP10:%.*]] = load double, ptr [[PTR_A]], align 8
593-
; I64-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <2 x double> poison, double [[TMP10]], i64 0
594-
; I64-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT]], <2 x double> poison, <2 x i32> zeroinitializer
595-
; I64-NEXT: [[TMP11:%.*]] = fadd <2 x double> [[BROADCAST_SPLAT]], zeroinitializer
596-
; I64-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP2]], i64 8
589+
; I64-NEXT: [[ADD1:%.*]] = fadd double [[TMP10]], 0.000000e+00
597590
; I64-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8
598-
; I64-NEXT: [[TMP14:%.*]] = load double, ptr [[TMP12]], align 8
599591
; I64-NEXT: [[TMP15:%.*]] = load double, ptr [[TMP13]], align 8
600-
; I64-NEXT: [[TMP16:%.*]] = insertelement <2 x double> poison, double [[TMP14]], i32 0
601-
; I64-NEXT: [[TMP17:%.*]] = insertelement <2 x double> [[TMP16]], double [[TMP15]], i32 1
602-
; I64-NEXT: [[TMP18:%.*]] = fmul <2 x double> [[TMP11]], zeroinitializer
603-
; I64-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <2 x double> poison, double [[ACCUM]], i64 0
604-
; I64-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLATINSERT1]], <2 x double> poison, <2 x i32> zeroinitializer
605-
; I64-NEXT: [[TMP19:%.*]] = shufflevector <2 x double> [[BROADCAST_SPLAT2]], <2 x double> [[TMP18]], <2 x i32> <i32 1, i32 2>
606-
; I64-NEXT: [[TMP20:%.*]] = fmul <2 x double> [[TMP17]], zeroinitializer
607-
; I64-NEXT: [[TMP21:%.*]] = fadd <2 x double> [[TMP20]], zeroinitializer
608-
; I64-NEXT: [[TMP22:%.*]] = fadd <2 x double> [[TMP21]], splat (double 1.000000e+00)
609-
; I64-NEXT: [[TMP23:%.*]] = load double, ptr [[TMP8]], align 8
592+
; I64-NEXT: [[MUL1]] = fmul double [[ADD1]], 0.000000e+00
593+
; I64-NEXT: [[MUL2:%.*]] = fmul double [[TMP15]], 0.000000e+00
594+
; I64-NEXT: [[ADD2:%.*]] = fadd double [[MUL2]], 0.000000e+00
595+
; I64-NEXT: [[ADD3:%.*]] = fadd double [[ADD2]], 1.000000e+00
610596
; I64-NEXT: [[TMP24:%.*]] = load double, ptr [[TMP9]], align 8
611-
; I64-NEXT: [[TMP25:%.*]] = insertelement <2 x double> poison, double [[TMP23]], i32 0
612-
; I64-NEXT: [[TMP26:%.*]] = insertelement <2 x double> [[TMP25]], double [[TMP24]], i32 1
613-
; I64-NEXT: [[TMP27:%.*]] = fdiv <2 x double> [[TMP26]], [[TMP22]]
614-
; I64-NEXT: [[TMP28:%.*]] = fsub <2 x double> [[TMP19]], [[TMP27]]
615-
; I64-NEXT: br label %[[MIDDLE_BLOCK:.*]]
616-
; I64: [[MIDDLE_BLOCK]]:
617-
; I64-NEXT: [[TMP29]] = extractelement <2 x double> [[TMP28]], i32 1
618-
; I64-NEXT: br label %[[OUTER_LOOP_LOOPEXIT]]
597+
; I64-NEXT: [[DIV:%.*]] = fdiv double [[TMP24]], [[ADD3]]
598+
; I64-NEXT: [[RESULT]] = fsub double [[ACCUM_INNER]], [[DIV]]
599+
; I64-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1
600+
; I64-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV]], 1
601+
; I64-NEXT: br i1 [[EXITCOND]], label %[[OUTER_LOOP_LOOPEXIT]], label %[[INNER_LOOP]]
619602
; I64: [[EXIT]]:
620603
; I64-NEXT: ret double [[ACCUM]]
621604
;

0 commit comments

Comments
 (0)