Skip to content

Commit 64db7f6

Browse files
[SimpleLoopUnswitch] Adjust cost multiplier accounting for parent loop size
When estimating the cost multiplier that guards against exponential unswitching of non-trivial invariant conditions, also take into account the number of basic blocks in the parent loop, ensuring that the parent loop's size does not grow unexpectedly. Fixes: #138509.
1 parent 8704e55 commit 64db7f6

File tree

2 files changed

+76
-5
lines changed

2 files changed

+76
-5
lines changed

llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp

Lines changed: 22 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,9 @@ static cl::opt<bool> EnableUnswitchCostMultiplier(
9898
static cl::opt<int> UnswitchSiblingsToplevelDiv(
9999
"unswitch-siblings-toplevel-div", cl::init(2), cl::Hidden,
100100
cl::desc("Toplevel siblings divisor for cost multiplier."));
101+
static cl::opt<int> UnswitchParentBlocksDiv(
102+
"unswitch-parent-blocks-div", cl::init(8), cl::Hidden,
103+
cl::desc("Outer loop size divisor for cost multiplier."));
101104
static cl::opt<int> UnswitchNumInitialUnscaledCandidates(
102105
"unswitch-num-initial-unscaled-candidates", cl::init(8), cl::Hidden,
103106
cl::desc("Number of unswitch candidates that are ignored when calculating "
@@ -2809,9 +2812,9 @@ static BranchInst *turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
28092812
}
28102813

28112814
/// Cost multiplier is a way to limit potentially exponential behavior
2812-
/// of loop-unswitch. Cost is multipied in proportion of 2^number of unswitch
2813-
/// candidates available. Also accounting for the number of "sibling" loops with
2814-
/// the idea to account for previous unswitches that already happened on this
2815+
/// of loop-unswitch. Cost is multiplied in proportion of 2^number of unswitch
2816+
/// candidates available. Also consider the number of "sibling" loops with
2817+
/// the idea of accounting for previous unswitches that already happened on this
28152818
/// cluster of loops. There was an attempt to keep this formula simple,
28162819
/// just enough to limit the worst case behavior. Even if it is not that simple
28172820
/// now it is still not an attempt to provide a detailed heuristic size
@@ -2842,7 +2845,19 @@ static int CalculateUnswitchCostMultiplier(
28422845
return 1;
28432846
}
28442847

2848+
// Each invariant non-trivial condition, after being unswitched, is supposed
2849+
// to have its own specialized sibling loop (the invariant condition has been
2850+
// hoisted out of the child loop into a newly-cloned loop). When unswitching
2851+
// conditions in nested loops, the basic block size of the outer loop should
2852+
// not be altered. If such a size significantly increases across unswitching
2853+
// invocations, something may be wrong; so adjust the final cost taking this
2854+
// into account.
28452855
auto *ParentL = L.getParentLoop();
2856+
int ParentLoopSizeMultiplier = 1;
2857+
if (ParentL)
2858+
ParentLoopSizeMultiplier =
2859+
std::max<int>(ParentL->getNumBlocks() / UnswitchParentBlocksDiv, 1);
2860+
28462861
int SiblingsCount = (ParentL ? ParentL->getSubLoopsVector().size()
28472862
: std::distance(LI.begin(), LI.end()));
28482863
// Count amount of clones that all the candidates might cause during
@@ -2887,14 +2902,16 @@ static int CalculateUnswitchCostMultiplier(
28872902
// at an upper bound.
28882903
int CostMultiplier;
28892904
if (ClonesPower > Log2_32(UnswitchThreshold) ||
2890-
SiblingsMultiplier > UnswitchThreshold)
2905+
SiblingsMultiplier > UnswitchThreshold ||
2906+
ParentLoopSizeMultiplier > UnswitchThreshold)
28912907
CostMultiplier = UnswitchThreshold;
28922908
else
28932909
CostMultiplier = std::min(SiblingsMultiplier * (1 << ClonesPower),
28942910
(int)UnswitchThreshold);
28952911

28962912
LLVM_DEBUG(dbgs() << " Computed multiplier " << CostMultiplier
2897-
<< " (siblings " << SiblingsMultiplier << " * clones "
2913+
<< " (siblings " << SiblingsMultiplier << " * parent size "
2914+
<< ParentLoopSizeMultiplier << " * clones "
28982915
<< (1 << ClonesPower) << ")"
28992916
<< " for unswitch candidate: " << TI << "\n");
29002917
return CostMultiplier;
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
2+
; RUN: opt < %s -S -enable-unswitch-cost-multiplier=true -unswitch-parent-blocks-div=1 \
3+
; RUN: -passes="loop-mssa(loop-simplifycfg,licm,loop-rotate,simple-loop-unswitch<nontrivial>),print<loops>" \
4+
; RUN: -disable-output 2>&1 | sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-DIV-1
5+
6+
; RUN: opt < %s -S -enable-unswitch-cost-multiplier=true -unswitch-parent-blocks-div=2 \
7+
; RUN: -passes="loop-mssa(loop-simplifycfg,licm,loop-rotate,simple-loop-unswitch<nontrivial>),print<loops>" \
8+
; RUN: -disable-output 2>&1 | sort -b -k 1 | FileCheck %s --check-prefixes=LOOP-DIV-2
9+
10+
; LOOP-DIV-1-COUNT-6: Loop at depth 1 containing:
11+
; LOOP-DIV-2-COUNT-12: Loop at depth 1 containing:
12+
13+
@a = global i32 0, align 4
14+
@b = global i32 0, align 4
15+
@c = global i32 0, align 4
16+
@d = global i32 0, align 4
17+
18+
define i32 @main() {
19+
entry:
20+
br label %outer.loop.header
21+
22+
outer.loop.header: ; preds = %outer.loop.latch, %entry
23+
br i1 false, label %exit, label %outer.loop.body
24+
25+
outer.loop.body: ; preds = %inner.loop.header, %outer.loop.header
26+
store i32 1, ptr @c, align 4
27+
%cmp = icmp sgt i32 0, -1
28+
br i1 %cmp, label %outer.loop.latch, label %exit
29+
30+
inner.loop.header: ; preds = %outer.loop.latch, %inner.loop.body
31+
%a_val = load i32, ptr @a, align 4
32+
%c_val = load i32, ptr @c, align 4
33+
%mul = mul nsw i32 %c_val, %a_val
34+
store i32 %mul, ptr @b, align 4
35+
%cmp2 = icmp sgt i32 %mul, -1
36+
br i1 %cmp2, label %inner.loop.body, label %outer.loop.body
37+
38+
inner.loop.body: ; preds = %inner.loop.header
39+
%mul2 = mul nsw i32 %c_val, 3
40+
store i32 %mul2, ptr @c, align 4
41+
store i32 %c_val, ptr @d, align 4
42+
%mul3 = mul nsw i32 %c_val, %a_val
43+
%cmp3 = icmp sgt i32 %mul3, -1
44+
br i1 %cmp3, label %inner.loop.header, label %exit
45+
46+
outer.loop.latch: ; preds = %outer.loop.body
47+
%d_val = load i32, ptr @d, align 4
48+
store i32 %d_val, ptr @b, align 4
49+
%cmp4 = icmp eq i32 %d_val, 0
50+
br i1 %cmp4, label %inner.loop.header, label %outer.loop.header
51+
52+
exit: ; preds = %inner.loop.body, %outer.loop.body, %outer.loop.header
53+
ret i32 0
54+
}

0 commit comments

Comments
 (0)