Commit f56c432

Give cond. loop threshold bonus to outer loop in loop nests
1 parent 95c2d79 commit f56c432

File tree

2 files changed: +151 -1 lines changed

llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp

Lines changed: 67 additions & 1 deletion
@@ -47,6 +47,13 @@ static cl::opt<unsigned> UnrollThresholdIf(
     cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
     cl::init(200), cl::Hidden);
 
+static cl::opt<unsigned> UnrollThresholdNestedStatic(
+    "amdgpu-unroll-threshold-nested-static",
+    cl::desc("Unroll threshold increment for AMDGPU for each nested loop whose "
+             "trip count will be made runtime-independent when fully-unrolling "
+             "the outer loop"),
+    cl::init(200), cl::Hidden);
+
 static cl::opt<bool> UnrollRuntimeLocal(
     "amdgpu-unroll-runtime-local",
     cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
@@ -148,8 +155,67 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
       }
     }
   }
-
   unsigned MaxBoost = std::max(ThresholdPrivate, ThresholdLocal);
+
+  if (llvm::PHINode *IV = L->getInductionVariable(SE)) {
+    // Look for subloops whose trip count would go from runtime-dependent to
+    // runtime-independent if we were to unroll the loop. Give a bonus to the
+    // current loop's unrolling threshold for each of these, as fully unrolling
+    // it would likely expose additional optimization opportunities.
+    for (const Loop *SubLoop : L->getSubLoops()) {
+      std::optional<Loop::LoopBounds> Bounds = SubLoop->getBounds(SE);
+      if (!Bounds)
+        continue;
+      Value *InitIV = &Bounds->getInitialIVValue();
+      Value *FinalIV = &Bounds->getFinalIVValue();
+      Value *StepVal = Bounds->getStepValue();
+      if (!StepVal)
+        continue;
+
+      // Determines whether SubIV's derivation depends exclusively on constants
+      // and/or IV; if it does, SubIVDependsOnIV is set to true if IV is
+      // involved in the derivation.
+      bool SubIVDependsOnIV = false;
+      std::function<bool(const Value *, unsigned)> FromConstsOrLoopIV =
+          [&](const Value *SubIV, unsigned Depth) -> bool {
+        if (SubIV == IV) {
+          SubIVDependsOnIV = true;
+          return true;
+        }
+        if (isa<Constant>(SubIV))
+          return true;
+        if (Depth >= 10)
+          return false;
+
+        const Instruction *I = dyn_cast<Instruction>(SubIV);
+        // No point in checking outside the loop since IV is necessarily inside
+        // it; also stop searching when encountering an instruction that will
+        // likely not allow SubIV's value to be statically computed.
+        if (!I || !L->contains(I) || !isa<BinaryOperator, CastInst, PHINode>(I))
+          return false;
+
+        // SubIV depends on constants or IV if all of the instruction's
+        // operands involved in its derivation also depend on constants or IV.
+        return llvm::all_of(I->operand_values(), [&](const Value *V) {
+          return FromConstsOrLoopIV(V, Depth + 1);
+        });
+      };
+
+      if (FromConstsOrLoopIV(InitIV, 0) && FromConstsOrLoopIV(FinalIV, 0) &&
+          FromConstsOrLoopIV(StepVal, 0) && SubIVDependsOnIV) {
+        UP.Threshold += UnrollThresholdNestedStatic;
+        LLVM_DEBUG(dbgs() << "Set unroll threshold " << UP.Threshold
+                          << " for loop:\n"
+                          << *L
+                          << " due to subloop's trip count becoming "
+                             "runtime-independent after unrolling:\n "
+                          << *SubLoop);
+        if (UP.Threshold >= MaxBoost)
+          return;
+      }
+    }
+  }
+
   for (const BasicBlock *BB : L->getBlocks()) {
     const DataLayout &DL = BB->getDataLayout();
     unsigned LocalGEPsSeen = 0;
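
For context, and not part of the commit: the pattern the new bonus targets corresponds to a source-level nest like the sketch below, in which the inner loop's bounds derive only from the outer IV and constants. Once the outer loop is fully unrolled, each copy of the inner loop gets a compile-time-constant trip count and becomes fully unrollable itself. The function name and bounds here are illustrative, chosen to mirror the IR test in the second file.

// Illustrative C++ sketch only; mirrors @dependent_sub_fullunroll below.
void nest(int *mem) {
  for (int i = 0; i < 8; ++i)     // outer loop: constant trip count (8)
    for (int j = i; j < 8; ++j)   // inner trip count (8 - i) depends on i
      mem[i * 16 + j] = 0;        // with i fixed by unrolling, 8 - i is constant
}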
Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+; RUN: opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug-only=AMDGPUtti < %s 2>&1 | FileCheck %s
+
+; For @dependent_sub_fullunroll, the threshold bonus should apply.
+; CHECK: due to subloop's trip count becoming runtime-independent after unrolling
+
+; For @dependent_sub_no_fullunroll, the threshold bonus should not apply.
+; CHECK-NOT: due to subloop's trip count becoming runtime-independent after unrolling
+
+; Check that the outer loop of a double-nested loop, where the inner loop's
+; trip count depends exclusively on constants and the outer IV, is fully
+; unrolled thanks to receiving a threshold bonus in AMDGPU's TTI.
+
+; CHECK-LABEL: @dependent_sub_fullunroll
+; CHECK: inner.header_latch_exiting.7
+; CHECK: outer.latch_exiting.7
+
+define void @dependent_sub_fullunroll(ptr noundef %mem) {
+entry:
+  br label %outer.header
+
+outer.header:                                     ; preds = %entry, %outer.latch_exiting
+  %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+  br label %inner.header_latch_exiting
+
+inner.header_latch_exiting:                       ; preds = %outer.header, %inner.header_latch_exiting
+  %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+  %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+  %outer.iv.ext = zext nneg i32 %outer.iv to i64
+  %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+  %inner.iv.ext = zext nneg i32 %inner.iv to i64
+  %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+  %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+  store i32 0, ptr %addr
+  %inner.cond = icmp ult i32 %inner.iv_next, 8
+  br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1
+
+outer.latch_exiting:                              ; preds = %inner.header_latch_exiting
+  %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+  %outer.cond = icmp ult i32 %outer.iv_next, 8
+  br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
+
+end:                                              ; preds = %outer.latch_exiting
+  ret void
+}
+
+; Check that the outer loop of the same loop nest as dependent_sub_fullunroll
+; is not fully unrolled when the inner loop's final IV value depends on a
+; function argument instead of a combination of the outer IV and constants.
+
+; CHECK-LABEL: @dependent_sub_no_fullunroll
+; CHECK-NOT: inner.header_latch_exiting.7
+; CHECK-NOT: outer.latch_exiting.7
+
+define void @dependent_sub_no_fullunroll(ptr noundef %mem, i32 noundef %inner.ub) {
+entry:
+  br label %outer.header
+
+outer.header:                                     ; preds = %entry, %outer.latch_exiting
+  %outer.iv = phi i32 [ 0, %entry ], [ %outer.iv_next, %outer.latch_exiting ]
+  br label %inner.header_latch_exiting
+
+inner.header_latch_exiting:                       ; preds = %outer.header, %inner.header_latch_exiting
+  %inner.iv = phi i32 [ %outer.iv, %outer.header ], [ %inner.iv_next, %inner.header_latch_exiting ]
+  %inner.iv_next = add nuw nsw i32 %inner.iv, 1
+  %outer.iv.ext = zext nneg i32 %outer.iv to i64
+  %idx_part = mul nuw nsw i64 %outer.iv.ext, 16
+  %inner.iv.ext = zext nneg i32 %inner.iv to i64
+  %idx = add nuw nsw i64 %idx_part, %inner.iv.ext
+  %addr = getelementptr inbounds i8, ptr %mem, i64 %idx
+  store i32 0, ptr %addr
+  %inner.cond = icmp ult i32 %inner.iv_next, %inner.ub
+  br i1 %inner.cond, label %inner.header_latch_exiting, label %outer.latch_exiting, !llvm.loop !1
+
+outer.latch_exiting:                              ; preds = %inner.header_latch_exiting
+  %outer.iv_next = add nuw nsw i32 %outer.iv, 1
+  %outer.cond = icmp ult i32 %outer.iv_next, 8
+  br i1 %outer.cond, label %outer.header, label %end, !llvm.loop !1
+
+end:                                              ; preds = %outer.latch_exiting
+  ret void
+}
+
+!1 = !{!1, !2}
+!2 = !{!"amdgpu.loop.unroll.threshold", i32 100}
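
Two notes for reproducing these checks by hand, both illustrative rather than part of the commit. The amdgpu.loop.unroll.threshold metadata above pins each loop's base unroll threshold at 100, presumably too low for the outer loops to fully unroll on their own, so the positive checks hinge on the new +200 bonus. Also, since UnrollThresholdNestedStatic is an ordinary cl::opt (cl::Hidden only hides it from -help), the bonus should be tunable from the command line, along these lines (file name is a placeholder):

  opt -S -mtriple=amdgcn-- -passes=loop-unroll -debug-only=AMDGPUtti \
      -amdgpu-unroll-threshold-nested-static=0 <this-test>.ll

Setting the knob to 0 leaves the debug message in place (the qualifying subloop is still detected) but should keep @dependent_sub_fullunroll's outer loop from being fully unrolled.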
