Skip to content

Commit 1f2fb8e

Browse files
authored
[AArch64] Tune unrolling prefs for more patterns on Apple CPUs (#149358)
Enhance the heuristics in `getAppleRuntimeUnrollPreferences` to allow a few more loops to be unrolled. Specifically, this patch adjusts two checks: (I) raise the loop size budget from 8 to 10; (II) include the immediate in-loop users of loaded values in the load/store dependency predicate. --------- Co-authored-by: Florian Hahn <[email protected]> PR: llvm/llvm-project#149358
1 parent bdddff2 commit 1f2fb8e

File tree

2 files changed

+213
-9
lines changed

2 files changed

+213
-9
lines changed

llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp

Lines changed: 15 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -4893,16 +4893,18 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
48934893
// Limit to loops with trip counts that are cheap to expand.
48944894
UP.SCEVExpansionBudget = 1;
48954895

4896-
// Try to unroll small, single block loops, if they have load/store
4897-
// dependencies, to expose more parallel memory access streams.
4896+
// Try to unroll small loops with few blocks and a low size budget, if they have
4897+
// load/store dependencies, to expose more parallel memory access streams,
4898+
// or if they do little work inside a block (i.e. load -> X -> store pattern).
48984899
BasicBlock *Header = L->getHeader();
48994900
if (Header == L->getLoopLatch()) {
49004901
// Estimate the size of the loop.
49014902
unsigned Size;
4902-
if (!isLoopSizeWithinBudget(L, TTI, 8, &Size))
4903+
unsigned Width = 10;
4904+
if (!isLoopSizeWithinBudget(L, TTI, Width, &Size))
49034905
return;
49044906

4905-
SmallPtrSet<Value *, 8> LoadedValues;
4907+
SmallPtrSet<Value *, 8> LoadedValuesPlus;
49064908
SmallVector<StoreInst *> Stores;
49074909
for (auto *BB : L->blocks()) {
49084910
for (auto &I : *BB) {
@@ -4912,9 +4914,13 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
49124914
const SCEV *PtrSCEV = SE.getSCEV(Ptr);
49134915
if (SE.isLoopInvariant(PtrSCEV, L))
49144916
continue;
4915-
if (isa<LoadInst>(&I))
4916-
LoadedValues.insert(&I);
4917-
else
4917+
if (isa<LoadInst>(&I)) {
4918+
LoadedValuesPlus.insert(&I);
4919+
// Also include the immediate (direct) in-loop users of loaded values.
4920+
for (auto *U : I.users())
4921+
if (L->contains(cast<Instruction>(U)))
4922+
LoadedValuesPlus.insert(U);
4923+
} else
49184924
Stores.push_back(cast<StoreInst>(&I));
49194925
}
49204926
}
@@ -4937,8 +4943,8 @@ getAppleRuntimeUnrollPreferences(Loop *L, ScalarEvolution &SE,
49374943
UC++;
49384944
}
49394945

4940-
if (BestUC == 1 || none_of(Stores, [&LoadedValues](StoreInst *SI) {
4941-
return LoadedValues.contains(SI->getOperand(0));
4946+
if (BestUC == 1 || none_of(Stores, [&LoadedValuesPlus](StoreInst *SI) {
4947+
return LoadedValuesPlus.contains(SI->getOperand(0));
49424948
}))
49434949
return;
49444950

llvm/test/Transforms/LoopUnroll/AArch64/apple-unrolling.ll

Lines changed: 198 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,204 @@ exit:
165165
ret void
166166
}
167167

168+
define void @load_op_store_loop(ptr %src, ptr %dst, i64 %N, i64 %scale, float %k) {
169+
; APPLE-LABEL: define void @load_op_store_loop(
170+
; APPLE-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]], float [[K:%.*]]) #[[ATTR0]] {
171+
; APPLE-NEXT: [[ENTRY:.*]]:
172+
; APPLE-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
173+
; APPLE-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1
174+
; APPLE-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
175+
; APPLE-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
176+
; APPLE: [[ENTRY_NEW]]:
177+
; APPLE-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
178+
; APPLE-NEXT: br label %[[LOOP:.*]]
179+
; APPLE: [[LOOP]]:
180+
; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ]
181+
; APPLE-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ]
182+
; APPLE-NEXT: [[SCALED_IV:%.*]] = mul nuw nsw i64 [[IV]], [[SCALE]]
183+
; APPLE-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV]]
184+
; APPLE-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
185+
; APPLE-NEXT: [[O:%.*]] = fadd float [[L]], [[K]]
186+
; APPLE-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]]
187+
; APPLE-NEXT: store float [[O]], ptr [[GEP_DST]], align 4
188+
; APPLE-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
189+
; APPLE-NEXT: [[SCALED_IV_1:%.*]] = mul nuw nsw i64 [[IV_NEXT]], [[SCALE]]
190+
; APPLE-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_1]]
191+
; APPLE-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
192+
; APPLE-NEXT: [[O_1:%.*]] = fadd float [[L_1]], [[K]]
193+
; APPLE-NEXT: [[GEP_DST_1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT]]
194+
; APPLE-NEXT: store float [[O_1]], ptr [[GEP_DST_1]], align 4
195+
; APPLE-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
196+
; APPLE-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
197+
; APPLE-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
198+
; APPLE-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]]
199+
; APPLE: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
200+
; APPLE-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ]
201+
; APPLE-NEXT: br label %[[EXIT_UNR_LCSSA]]
202+
; APPLE: [[EXIT_UNR_LCSSA]]:
203+
; APPLE-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
204+
; APPLE-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
205+
; APPLE-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
206+
; APPLE: [[LOOP_EPIL_PREHEADER]]:
207+
; APPLE-NEXT: br label %[[LOOP_EPIL:.*]]
208+
; APPLE: [[LOOP_EPIL]]:
209+
; APPLE-NEXT: [[SCALED_IV_EPIL:%.*]] = mul nuw nsw i64 [[IV_UNR]], [[SCALE]]
210+
; APPLE-NEXT: [[GEP_SRC_EPIL:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL]]
211+
; APPLE-NEXT: [[L_EPIL:%.*]] = load float, ptr [[GEP_SRC_EPIL]], align 4
212+
; APPLE-NEXT: [[O_EPIL:%.*]] = fadd float [[L_EPIL]], [[K]]
213+
; APPLE-NEXT: [[GEP_DST_EPIL:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_UNR]]
214+
; APPLE-NEXT: store float [[O_EPIL]], ptr [[GEP_DST_EPIL]], align 4
215+
; APPLE-NEXT: br label %[[EXIT]]
216+
; APPLE: [[EXIT]]:
217+
; APPLE-NEXT: ret void
218+
;
219+
; OTHER-LABEL: define void @load_op_store_loop(
220+
; OTHER-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]], float [[K:%.*]]) #[[ATTR0]] {
221+
; OTHER-NEXT: [[ENTRY:.*]]:
222+
; OTHER-NEXT: [[TMP0:%.*]] = add i64 [[N]], -1
223+
; OTHER-NEXT: [[XTRAITER:%.*]] = and i64 [[N]], 1
224+
; OTHER-NEXT: [[TMP1:%.*]] = icmp ult i64 [[TMP0]], 1
225+
; OTHER-NEXT: br i1 [[TMP1]], label %[[EXIT_UNR_LCSSA:.*]], label %[[ENTRY_NEW:.*]]
226+
; OTHER: [[ENTRY_NEW]]:
227+
; OTHER-NEXT: [[UNROLL_ITER:%.*]] = sub i64 [[N]], [[XTRAITER]]
228+
; OTHER-NEXT: br label %[[LOOP:.*]]
229+
; OTHER: [[LOOP]]:
230+
; OTHER-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[IV_NEXT_1:%.*]], %[[LOOP]] ]
231+
; OTHER-NEXT: [[NITER:%.*]] = phi i64 [ 0, %[[ENTRY_NEW]] ], [ [[NITER_NEXT_1:%.*]], %[[LOOP]] ]
232+
; OTHER-NEXT: [[SCALED_IV:%.*]] = mul nuw nsw i64 [[IV]], [[SCALE]]
233+
; OTHER-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV]]
234+
; OTHER-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
235+
; OTHER-NEXT: [[O:%.*]] = fadd float [[L]], [[K]]
236+
; OTHER-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]]
237+
; OTHER-NEXT: store float [[O]], ptr [[GEP_DST]], align 4
238+
; OTHER-NEXT: [[IV_NEXT:%.*]] = add nuw nsw i64 [[IV]], 1
239+
; OTHER-NEXT: [[SCALED_IV_1:%.*]] = mul nuw nsw i64 [[IV_NEXT]], [[SCALE]]
240+
; OTHER-NEXT: [[GEP_SRC_1:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_1]]
241+
; OTHER-NEXT: [[L_1:%.*]] = load float, ptr [[GEP_SRC_1]], align 4
242+
; OTHER-NEXT: [[O_1:%.*]] = fadd float [[L_1]], [[K]]
243+
; OTHER-NEXT: [[GEP_DST_1:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_NEXT]]
244+
; OTHER-NEXT: store float [[O_1]], ptr [[GEP_DST_1]], align 4
245+
; OTHER-NEXT: [[IV_NEXT_1]] = add nuw nsw i64 [[IV]], 2
246+
; OTHER-NEXT: [[NITER_NEXT_1]] = add i64 [[NITER]], 2
247+
; OTHER-NEXT: [[NITER_NCMP_1:%.*]] = icmp eq i64 [[NITER_NEXT_1]], [[UNROLL_ITER]]
248+
; OTHER-NEXT: br i1 [[NITER_NCMP_1]], label %[[EXIT_UNR_LCSSA_LOOPEXIT:.*]], label %[[LOOP]]
249+
; OTHER: [[EXIT_UNR_LCSSA_LOOPEXIT]]:
250+
; OTHER-NEXT: [[IV_UNR_PH:%.*]] = phi i64 [ [[IV_NEXT_1]], %[[LOOP]] ]
251+
; OTHER-NEXT: br label %[[EXIT_UNR_LCSSA]]
252+
; OTHER: [[EXIT_UNR_LCSSA]]:
253+
; OTHER-NEXT: [[IV_UNR:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_UNR_PH]], %[[EXIT_UNR_LCSSA_LOOPEXIT]] ]
254+
; OTHER-NEXT: [[LCMP_MOD:%.*]] = icmp ne i64 [[XTRAITER]], 0
255+
; OTHER-NEXT: br i1 [[LCMP_MOD]], label %[[LOOP_EPIL_PREHEADER:.*]], label %[[EXIT:.*]]
256+
; OTHER: [[LOOP_EPIL_PREHEADER]]:
257+
; OTHER-NEXT: br label %[[LOOP_EPIL:.*]]
258+
; OTHER: [[LOOP_EPIL]]:
259+
; OTHER-NEXT: [[SCALED_IV_EPIL:%.*]] = mul nuw nsw i64 [[IV_UNR]], [[SCALE]]
260+
; OTHER-NEXT: [[GEP_SRC_EPIL:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV_EPIL]]
261+
; OTHER-NEXT: [[L_EPIL:%.*]] = load float, ptr [[GEP_SRC_EPIL]], align 4
262+
; OTHER-NEXT: [[O_EPIL:%.*]] = fadd float [[L_EPIL]], [[K]]
263+
; OTHER-NEXT: [[GEP_DST_EPIL:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV_UNR]]
264+
; OTHER-NEXT: store float [[O_EPIL]], ptr [[GEP_DST_EPIL]], align 4
265+
; OTHER-NEXT: br label %[[EXIT]]
266+
; OTHER: [[EXIT]]:
267+
; OTHER-NEXT: ret void
268+
;
269+
entry:
270+
br label %loop
271+
272+
loop:
273+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
274+
%scaled.iv = mul nuw nsw i64 %iv, %scale
275+
%gep.src = getelementptr inbounds float, ptr %src, i64 %scaled.iv
276+
%l = load float, ptr %gep.src, align 4
277+
%o = fadd float %l, %k
278+
%gep.dst = getelementptr inbounds float, ptr %dst, i64 %iv
279+
store float %o, ptr %gep.dst, align 4
280+
%iv.next = add nuw nsw i64 %iv, 1
281+
%ec = icmp eq i64 %iv.next, %N
282+
br i1 %ec, label %exit, label %loop
283+
284+
exit:
285+
ret void
286+
}
287+
288+
define void @load_op_store_loop_multiblock(ptr %src, ptr %dst, i64 %N, i64 %scale, float %k) {
289+
; APPLE-LABEL: define void @load_op_store_loop_multiblock(
290+
; APPLE-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]], float [[K:%.*]]) #[[ATTR0]] {
291+
; APPLE-NEXT: [[ENTRY:.*]]:
292+
; APPLE-NEXT: br label %[[LOOP:.*]]
293+
; APPLE: [[LOOP]]:
294+
; APPLE-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOPCONT:.*]] ]
295+
; APPLE-NEXT: [[SCALED_IV:%.*]] = mul nuw nsw i64 [[IV]], [[SCALE]]
296+
; APPLE-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV]]
297+
; APPLE-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
298+
; APPLE-NEXT: [[AND:%.*]] = and i64 [[IV]], 1
299+
; APPLE-NEXT: [[ODD:%.*]] = icmp eq i64 [[AND]], 1
300+
; APPLE-NEXT: br i1 [[ODD]], label %[[LOOPODD:.*]], label %[[LOOPCONT]]
301+
; APPLE: [[LOOPCONT]]:
302+
; APPLE-NEXT: [[D:%.*]] = phi float [ [[L2:%.*]], %[[LOOPODD]] ], [ [[L]], %[[LOOP]] ]
303+
; APPLE-NEXT: [[O:%.*]] = fadd float [[D]], [[K]]
304+
; APPLE-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]]
305+
; APPLE-NEXT: store float [[O]], ptr [[GEP_DST]], align 4
306+
; APPLE-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
307+
; APPLE-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
308+
; APPLE-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
309+
; APPLE: [[LOOPODD]]:
310+
; APPLE-NEXT: [[L2]] = fneg float [[L]]
311+
; APPLE-NEXT: br label %[[LOOPCONT]]
312+
; APPLE: [[EXIT]]:
313+
; APPLE-NEXT: ret void
314+
;
315+
; OTHER-LABEL: define void @load_op_store_loop_multiblock(
316+
; OTHER-SAME: ptr [[SRC:%.*]], ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[SCALE:%.*]], float [[K:%.*]]) #[[ATTR0]] {
317+
; OTHER-NEXT: [[ENTRY:.*]]:
318+
; OTHER-NEXT: br label %[[LOOP:.*]]
319+
; OTHER: [[LOOP]]:
320+
; OTHER-NEXT: [[IV:%.*]] = phi i64 [ 0, %[[ENTRY]] ], [ [[IV_NEXT:%.*]], %[[LOOPCONT:.*]] ]
321+
; OTHER-NEXT: [[SCALED_IV:%.*]] = mul nuw nsw i64 [[IV]], [[SCALE]]
322+
; OTHER-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds float, ptr [[SRC]], i64 [[SCALED_IV]]
323+
; OTHER-NEXT: [[L:%.*]] = load float, ptr [[GEP_SRC]], align 4
324+
; OTHER-NEXT: [[AND:%.*]] = and i64 [[IV]], 1
325+
; OTHER-NEXT: [[ODD:%.*]] = icmp eq i64 [[AND]], 1
326+
; OTHER-NEXT: br i1 [[ODD]], label %[[LOOPODD:.*]], label %[[LOOPCONT]]
327+
; OTHER: [[LOOPCONT]]:
328+
; OTHER-NEXT: [[D:%.*]] = phi float [ [[L2:%.*]], %[[LOOPODD]] ], [ [[L]], %[[LOOP]] ]
329+
; OTHER-NEXT: [[O:%.*]] = fadd float [[D]], [[K]]
330+
; OTHER-NEXT: [[GEP_DST:%.*]] = getelementptr inbounds float, ptr [[DST]], i64 [[IV]]
331+
; OTHER-NEXT: store float [[O]], ptr [[GEP_DST]], align 4
332+
; OTHER-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1
333+
; OTHER-NEXT: [[EC:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]]
334+
; OTHER-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]]
335+
; OTHER: [[LOOPODD]]:
336+
; OTHER-NEXT: [[L2]] = fneg float [[L]]
337+
; OTHER-NEXT: br label %[[LOOPCONT]]
338+
; OTHER: [[EXIT]]:
339+
; OTHER-NEXT: ret void
340+
;
341+
entry:
342+
br label %loop
343+
loop:
344+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %loopcont ]
345+
%scaled.iv = mul nuw nsw i64 %iv, %scale
346+
%gep.src = getelementptr inbounds float, ptr %src, i64 %scaled.iv
347+
%l1 = load float, ptr %gep.src, align 4
348+
%and = and i64 %iv, 1
349+
%odd = icmp eq i64 %and, 1
350+
br i1 %odd, label %loopodd, label %loopcont
351+
loopcont:
352+
%d = phi float [ %l2, %loopodd ], [ %l1, %loop]
353+
%o = fadd float %d, %k
354+
%gep.dst = getelementptr inbounds float, ptr %dst, i64 %iv
355+
store float %o, ptr %gep.dst, align 4
356+
%iv.next = add nuw nsw i64 %iv, 1
357+
%ec = icmp eq i64 %iv.next, %N
358+
br i1 %ec, label %exit, label %loop
359+
loopodd:
360+
%l2 = fneg float %l1
361+
br label %loopcont
362+
exit:
363+
ret void
364+
}
365+
168366
@A = external constant [9 x i8], align 1
169367
@B = external constant [8 x i32], align 4
170368
@C = external constant [8 x i32], align 4

0 commit comments

Comments
 (0)