@@ -4568,6 +4568,71 @@ getFalkorUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   }
 }
 
+// This function returns true if the loop:
+// 1. Has a valid cost, and
+// 2. Has a cost within the supplied budget.
+// Otherwise it returns false.
+static bool isLoopSizeWithinBudget(Loop *L, AArch64TTIImpl &TTI,
+                                   InstructionCost Budget,
+                                   unsigned *FinalSize) {
+  // Estimate the size of the loop.
+  InstructionCost LoopCost = 0;
+
+  for (auto *BB : L->getBlocks()) {
+    for (auto &I : *BB) {
+      SmallVector<const Value *, 4> Operands(I.operand_values());
+      InstructionCost Cost =
+          TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize);
+      // This can happen with intrinsics that don't currently have a cost model
+      // or for some operations that require SVE.
+      if (!Cost.isValid())
+        return false;
+
+      LoopCost += Cost;
+      if (LoopCost > Budget)
+        return false;
+    }
+  }
+
+  if (FinalSize)
+    *FinalSize = *LoopCost.getValue();
+  return true;
+}
+
+static bool shouldUnrollMultiExitLoop(Loop *L, ScalarEvolution &SE,
+                                      AArch64TTIImpl &TTI) {
+  // Only consider loops with unknown trip counts for which we can determine
+  // a symbolic expression. Multi-exit loops with small known trip counts will
+  // likely be unrolled anyway.
+  const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
+  if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC))
+    return false;
+
+  // It might not be worth unrolling loops with low max trip counts. Restrict
+  // this to max trip counts > 32 for now.
+  unsigned MaxTC = SE.getSmallConstantMaxTripCount(L);
+  if (MaxTC > 0 && MaxTC <= 32)
+    return false;
+
+  // Make sure the loop size is <= 5.
+  if (!isLoopSizeWithinBudget(L, TTI, 5, nullptr))
+    return false;
+
+  // Small search loops with multiple exits can be highly beneficial to unroll.
+  // We only care about loops with exactly two exiting blocks, although each
+  // block could jump to the same exit block.
+  ArrayRef<BasicBlock *> Blocks = L->getBlocks();
+  if (Blocks.size() != 2)
+    return false;
+
+  if (any_of(Blocks, [](BasicBlock *BB) {
+        return !isa<BranchInst>(BB->getTerminator());
+      }))
+    return false;
+
+  return true;
+}
+
 /// For Apple CPUs, we want to runtime-unroll loops to make better use of the
 /// OOO engine's wide instruction window and various predictors.
 static void
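
To illustrate (my sketch, not part of the patch): shouldUnrollMultiExitLoop is aimed at two-block search loops in the style of std::find, where both exiting blocks end in conditional branches and the trip count is only known symbolically. The function name and shape below are hypothetical.

    // Hypothetical source loop matching the new predicate: two exiting
    // blocks (early exit on match, bound check on the latch), a symbolic
    // trip count, and a body small enough for the size budget of 5.
    const int *find_value(const int *P, const int *End, int Val) {
      for (; P != End; ++P) // exiting block: bound check
        if (*P == Val)      // exiting block: early exit on match
          return P;
      return End;
    }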
@@ -4582,43 +4647,18 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
   if (!L->isInnermost() || L->getNumBlocks() > 8)
     return;
 
+  // Loops with multiple exits are handled by common code.
+  if (!L->getExitBlock())
+    return;
+
   const SCEV *BTC = SE.getSymbolicMaxBackedgeTakenCount(L);
   if (isa<SCEVConstant>(BTC) || isa<SCEVCouldNotCompute>(BTC) ||
       (SE.getSmallConstantMaxTripCount(L) > 0 &&
        SE.getSmallConstantMaxTripCount(L) <= 32))
     return;
-  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
-    return;
-
-  int64_t Size = 0;
-  for (auto *BB : L->getBlocks()) {
-    for (auto &I : *BB) {
-      if (!isa<IntrinsicInst>(&I) && isa<CallBase>(&I))
-        return;
-      SmallVector<const Value *, 4> Operands(I.operand_values());
-      Size +=
-          *TTI.getInstructionCost(&I, Operands, TTI::TCK_CodeSize).getValue();
-    }
-  }
 
-  // Small search loops with multiple exits can be highly beneficial to unroll.
-  if (!L->getExitBlock()) {
-    if (L->getNumBlocks() == 2 && Size < 6 &&
-        all_of(
-            L->getBlocks(),
-            [](BasicBlock *BB) {
-              return isa<BranchInst>(BB->getTerminator());
-            })) {
-      UP.RuntimeUnrollMultiExit = true;
-      UP.Runtime = true;
-      // Limit unroll count.
-      UP.DefaultUnrollRuntimeCount = 4;
-      // Allow slightly more costly trip-count expansion to catch search loops
-      // with pointer inductions.
-      UP.SCEVExpansionBudget = 5;
-    }
+  if (findStringMetadataForLoop(L, "llvm.loop.isvectorized"))
     return;
-  }
 
   if (SE.getSymbolicMaxBackedgeTakenCount(L) != SE.getBackedgeTakenCount(L))
     return;
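
For contrast, a minimal sketch (mine, with made-up names) of a loop that still reaches the Apple-specific heuristics after this change: it has a single exit block, a symbolic rather than constant backedge-taken count, and no llvm.loop.isvectorized metadata.

    // Hypothetical single-exit loop: multi-exit candidates now bail out to
    // the common code, but this one continues into the Apple runtime-unroll
    // logic that follows.
    float sum(const float *P, const float *End) {
      float S = 0.0f;
      for (; P != End; ++P) // one exit; trip count unknown at compile time
        S += *P;
      return S;
    }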
@@ -4630,7 +4670,9 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
   // dependencies, to expose more parallel memory access streams.
   BasicBlock *Header = L->getHeader();
   if (Header == L->getLoopLatch()) {
-    if (Size > 8)
+    // Estimate the size of the loop.
+    unsigned Size;
+    if (!isLoopSizeWithinBudget(L, TTI, 8, &Size))
       return;
 
     SmallPtrSet<Value *, 8> LoadedValues;
@@ -4727,6 +4769,25 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
   // Disable partial & runtime unrolling on -Os.
   UP.PartialOptSizeThreshold = 0;
 
+  // Scan the loop: don't unroll loops with calls as this could prevent
+  // inlining. Don't unroll vector loops either, as they don't benefit much from
+  // unrolling.
+  for (auto *BB : L->getBlocks()) {
+    for (auto &I : *BB) {
+      // Don't unroll vectorised loops.
+      if (I.getType()->isVectorTy())
+        return;
+
+      if (isa<CallBase>(I)) {
+        if (isa<CallInst>(I) || isa<InvokeInst>(I))
+          if (const Function *F = cast<CallBase>(I).getCalledFunction())
+            if (!isLoweredToCall(F))
+              continue;
+        return;
+      }
+    }
+  }
+
   // Apply subtarget-specific unrolling preferences.
   switch (ST->getProcFamily()) {
   case AArch64Subtarget::AppleA14:
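
A quick sketch (my example, not from the patch) of what the hoisted scan now rejects for every subtarget: loops whose bodies produce vector values, or loops containing calls that really are lowered to calls, such as the indirect call below, where getCalledFunction() returns null and the scan gives up on unrolling.

    // Hypothetical loop the scan filters out: the callee is not statically
    // known, so unrolling would only duplicate an opaque call site.
    void apply(float *A, int N, float (*Fn)(float)) {
      for (int I = 0; I < N; ++I)
        A[I] = Fn(A[I]); // indirect call: getCalledFunction() is null
    }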
@@ -4743,23 +4804,17 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
     break;
   }
 
-  // Scan the loop: don't unroll loops with calls as this could prevent
-  // inlining. Don't unroll vector loops either, as they don't benefit much from
-  // unrolling.
-  for (auto *BB : L->getBlocks()) {
-    for (auto &I : *BB) {
-      // Don't unroll vectorised loops.
-      if (I.getType()->isVectorTy())
-        return;
-
-      if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
-        if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
-          if (!isLoweredToCall(F))
-            continue;
-        }
-        return;
-      }
-    }
+  // If this is a small, multi-exit loop similar to something like std::find,
+  // then there is typically a performance improvement achieved by unrolling.
+  if (!L->getExitBlock() && shouldUnrollMultiExitLoop(L, SE, *this)) {
+    UP.RuntimeUnrollMultiExit = true;
+    UP.Runtime = true;
+    // Limit unroll count.
+    UP.DefaultUnrollRuntimeCount = 4;
+    // Allow slightly more costly trip-count expansion to catch search loops
+    // with pointer inductions.
+    UP.SCEVExpansionBudget = 5;
+    return;
   }
 
   // Enable runtime unrolling for in-order models
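
To make the intent of these settings concrete, here is a rough sketch (mine, not compiler output) of the shape a multi-exit search loop takes after runtime unrolling with DefaultUnrollRuntimeCount = 4: each unrolled copy keeps its early exit, and a remainder loop covers the final iterations.

    // Hypothetical hand-unrolled equivalent of the std::find-style loop.
    const int *find_unrolled(const int *P, const int *End, int Val) {
      for (; End - P >= 4; P += 4) { // main body unrolled by 4
        if (P[0] == Val) return P + 0;
        if (P[1] == Val) return P + 1;
        if (P[2] == Val) return P + 2;
        if (P[3] == Val) return P + 3;
      }
      for (; P != End; ++P) // remainder loop for the last 0-3 elements
        if (*P == Val) return P;
      return End;
    }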