Skip to content

Commit 5bf5df1

Browse files
david-arm and github-actions[bot]
authored and committed
Automerge: [LV] Always emit branch weights for vector epilogue (#155437)
We currently emit the branch weights for the epilogue iteration count check only if the scalar loop already has branch weight metadata. However, the code makes no use of the existing branch weights when estimating the likelihood of taking a particular branch, so we can always add the branch weights regardless. These hints should hopefully improve code generation.
2 parents 9692c57 + e867b85 commit 5bf5df1

File tree

2 files changed

+127
-15
lines changed

2 files changed

+127
-15
lines changed

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 15 additions & 15 deletions
Original file line number | Diff line number | Diff line change
@@ -7547,21 +7547,21 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck(
75477547

75487548
BranchInst &BI =
75497549
*BranchInst::Create(Bypass, LoopVectorPreHeader, CheckMinIters);
7550-
if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) {
7551-
auto VScale = Cost->getVScaleForTuning();
7552-
unsigned MainLoopStep =
7553-
estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
7554-
unsigned EpilogueLoopStep =
7555-
estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
7556-
// We assume the remaining `Count` is equally distributed in
7557-
// [0, MainLoopStep)
7558-
// So the probability for `Count < EpilogueLoopStep` should be
7559-
// min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
7560-
unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7561-
const uint32_t Weights[] = {EstimatedSkipCount,
7562-
MainLoopStep - EstimatedSkipCount};
7563-
setBranchWeights(BI, Weights, /*IsExpected=*/false);
7564-
}
7550+
auto VScale = Cost->getVScaleForTuning();
7551+
unsigned MainLoopStep =
7552+
estimateElementCount(EPI.MainLoopVF * EPI.MainLoopUF, VScale);
7553+
unsigned EpilogueLoopStep =
7554+
estimateElementCount(EPI.EpilogueVF * EPI.EpilogueUF, VScale);
7555+
// We assume the remaining `Count` is equally distributed in
7556+
// [0, MainLoopStep)
7557+
// So the probability for `Count < EpilogueLoopStep` should be
7558+
// min(MainLoopStep, EpilogueLoopStep) / MainLoopStep
7559+
// TODO: Improve the estimate by taking the estimated trip count into
7560+
// consideration.
7561+
unsigned EstimatedSkipCount = std::min(MainLoopStep, EpilogueLoopStep);
7562+
const uint32_t Weights[] = {EstimatedSkipCount,
7563+
MainLoopStep - EstimatedSkipCount};
7564+
setBranchWeights(BI, Weights, /*IsExpected=*/false);
75657565
ReplaceInstWithInst(Insert->getTerminator(), &BI);
75667566

75677567
// A new entry block has been created for the epilogue VPlan. Hook it in, as

llvm/test/Transforms/LoopVectorize/AArch64/check-prof-info.ll

Lines changed: 112 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -236,6 +236,108 @@ for.cond.cleanup:
236236
ret void
237237
}
238238

239+
define void @foo_i32_no_bw(i64 %n) {
240+
; CHECK-V1-IC1-LABEL: define void @foo_i32_no_bw(
241+
; CHECK-V1-IC1-SAME: i64 [[N:%.*]]) #[[ATTR0]] {
242+
; CHECK-V1-IC1: [[ENTRY:.*:]]
243+
; CHECK-V1-IC1: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
244+
; CHECK-V1-IC1: [[VECTOR_PH]]:
245+
; CHECK-V1-IC1: br label %[[VECTOR_BODY:.*]]
246+
; CHECK-V1-IC1: [[VECTOR_BODY]]:
247+
; CHECK-V1-IC1: br i1 [[TMP6:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]]
248+
; CHECK-V1-IC1: [[MIDDLE_BLOCK]]:
249+
; CHECK-V1-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
250+
; CHECK-V1-IC1: [[SCALAR_PH]]:
251+
; CHECK-V1-IC1: br label %[[FOR_BODY:.*]]
252+
; CHECK-V1-IC1: [[FOR_BODY]]:
253+
; CHECK-V1-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
254+
; CHECK-V1-IC1: [[FOR_COND_CLEANUP]]:
255+
;
256+
; CHECK-V1-IC1-FORCE-EPI4-LABEL: define void @foo_i32_no_bw(
257+
; CHECK-V1-IC1-FORCE-EPI4-SAME: i64 [[N:%.*]]) #[[ATTR0]] {
258+
; CHECK-V1-IC1-FORCE-EPI4: [[ITER_CHECK:.*:]]
259+
; CHECK-V1-IC1-FORCE-EPI4: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
260+
; CHECK-V1-IC1-FORCE-EPI4: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
261+
; CHECK-V1-IC1-FORCE-EPI4: br i1 [[MIN_ITERS_CHECK1:%.*]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
262+
; CHECK-V1-IC1-FORCE-EPI4: [[VECTOR_PH]]:
263+
; CHECK-V1-IC1-FORCE-EPI4: br label %[[VECTOR_BODY:.*]]
264+
; CHECK-V1-IC1-FORCE-EPI4: [[VECTOR_BODY]]:
265+
; CHECK-V1-IC1-FORCE-EPI4: br i1 [[TMP6:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
266+
; CHECK-V1-IC1-FORCE-EPI4: [[MIDDLE_BLOCK]]:
267+
; CHECK-V1-IC1-FORCE-EPI4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
268+
; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_ITER_CHECK]]:
269+
; CHECK-V1-IC1-FORCE-EPI4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF5]]
270+
; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_PH]]:
271+
; CHECK-V1-IC1-FORCE-EPI4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
272+
; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_VECTOR_BODY]]:
273+
; CHECK-V1-IC1-FORCE-EPI4: br i1 [[TMP9:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
274+
; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_MIDDLE_BLOCK]]:
275+
; CHECK-V1-IC1-FORCE-EPI4: br i1 [[CMP_N7:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]]
276+
; CHECK-V1-IC1-FORCE-EPI4: [[VEC_EPILOG_SCALAR_PH]]:
277+
; CHECK-V1-IC1-FORCE-EPI4: br label %[[FOR_BODY:.*]]
278+
; CHECK-V1-IC1-FORCE-EPI4: [[FOR_BODY]]:
279+
; CHECK-V1-IC1-FORCE-EPI4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
280+
; CHECK-V1-IC1-FORCE-EPI4: [[FOR_COND_CLEANUP]]:
281+
;
282+
; CHECK-V2-IC1-LABEL: define void @foo_i32_no_bw(
283+
; CHECK-V2-IC1-SAME: i64 [[N:%.*]]) #[[ATTR0]] {
284+
; CHECK-V2-IC1: [[ENTRY:.*:]]
285+
; CHECK-V2-IC1: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
286+
; CHECK-V2-IC1: [[VECTOR_PH]]:
287+
; CHECK-V2-IC1: br label %[[VECTOR_BODY:.*]]
288+
; CHECK-V2-IC1: [[VECTOR_BODY]]:
289+
; CHECK-V2-IC1: br i1 [[TMP2:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]]
290+
; CHECK-V2-IC1: [[MIDDLE_BLOCK]]:
291+
; CHECK-V2-IC1: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[SCALAR_PH]]
292+
; CHECK-V2-IC1: [[SCALAR_PH]]:
293+
; CHECK-V2-IC1: br label %[[FOR_BODY:.*]]
294+
; CHECK-V2-IC1: [[FOR_BODY]]:
295+
; CHECK-V2-IC1: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]]
296+
; CHECK-V2-IC1: [[FOR_COND_CLEANUP]]:
297+
;
298+
; CHECK-V2-IC4-LABEL: define void @foo_i32_no_bw(
299+
; CHECK-V2-IC4-SAME: i64 [[N:%.*]]) #[[ATTR0]] {
300+
; CHECK-V2-IC4: [[ITER_CHECK:.*:]]
301+
; CHECK-V2-IC4: br i1 [[MIN_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH:.*]], label %[[VECTOR_MAIN_LOOP_ITER_CHECK:.*]]
302+
; CHECK-V2-IC4: [[VECTOR_MAIN_LOOP_ITER_CHECK]]:
303+
; CHECK-V2-IC4: br i1 [[MIN_ITERS_CHECK1:%.*]], label %[[VEC_EPILOG_PH:.*]], label %[[VECTOR_PH:.*]]
304+
; CHECK-V2-IC4: [[VECTOR_PH]]:
305+
; CHECK-V2-IC4: br label %[[VECTOR_BODY:.*]]
306+
; CHECK-V2-IC4: [[VECTOR_BODY]]:
307+
; CHECK-V2-IC4: br i1 [[TMP8:%.*]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]]
308+
; CHECK-V2-IC4: [[MIDDLE_BLOCK]]:
309+
; CHECK-V2-IC4: br i1 [[CMP_N:%.*]], label %[[FOR_COND_CLEANUP:.*]], label %[[VEC_EPILOG_ITER_CHECK:.*]]
310+
; CHECK-V2-IC4: [[VEC_EPILOG_ITER_CHECK]]:
311+
; CHECK-V2-IC4: br i1 [[MIN_EPILOG_ITERS_CHECK:%.*]], label %[[VEC_EPILOG_SCALAR_PH]], label %[[VEC_EPILOG_PH]], !prof [[PROF6]]
312+
; CHECK-V2-IC4: [[VEC_EPILOG_PH]]:
313+
; CHECK-V2-IC4: br label %[[VEC_EPILOG_VECTOR_BODY:.*]]
314+
; CHECK-V2-IC4: [[VEC_EPILOG_VECTOR_BODY]]:
315+
; CHECK-V2-IC4: br i1 [[TMP11:%.*]], label %[[VEC_EPILOG_MIDDLE_BLOCK:.*]], label %[[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]]
316+
; CHECK-V2-IC4: [[VEC_EPILOG_MIDDLE_BLOCK]]:
317+
; CHECK-V2-IC4: br i1 [[CMP_N10:%.*]], label %[[FOR_COND_CLEANUP]], label %[[VEC_EPILOG_SCALAR_PH]]
318+
; CHECK-V2-IC4: [[VEC_EPILOG_SCALAR_PH]]:
319+
; CHECK-V2-IC4: br label %[[FOR_BODY:.*]]
320+
; CHECK-V2-IC4: [[FOR_BODY]]:
321+
; CHECK-V2-IC4: br i1 [[EXITCOND:%.*]], label %[[FOR_COND_CLEANUP]], label %[[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]]
322+
; CHECK-V2-IC4: [[FOR_COND_CLEANUP]]:
323+
;
324+
entry:
325+
br label %for.body
326+
327+
for.body: ; preds = %for.body, %entry
328+
%iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
329+
%arrayidx = getelementptr inbounds [1024 x i32], ptr @b, i64 0, i64 %iv
330+
%load = load i32, ptr %arrayidx, align 4
331+
%arrayidx2 = getelementptr inbounds [1024 x i32], ptr @a, i64 0, i64 %iv
332+
store i32 %load, ptr %arrayidx2, align 4
333+
%iv.next = add nuw nsw i64 %iv, 1
334+
%exitcond = icmp eq i64 %iv.next, %n
335+
br i1 %exitcond, label %for.cond.cleanup, label %for.body
336+
337+
for.cond.cleanup: ; preds = %for.body
338+
ret void
339+
}
340+
239341
!0 = !{!"branch_weights", i32 1, i32 1023}
240342
;.
241343
; CHECK-V1-IC1: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
@@ -251,6 +353,8 @@ for.cond.cleanup:
251353
; CHECK-V1-IC1: [[LOOP10]] = distinct !{[[LOOP10]], [[META2]], [[META3]]}
252354
; CHECK-V1-IC1: [[PROF11]] = !{!"branch_weights", i32 1, i32 15}
253355
; CHECK-V1-IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META3]], [[META2]]}
356+
; CHECK-V1-IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META3]]}
357+
; CHECK-V1-IC1: [[LOOP14]] = distinct !{[[LOOP14]], [[META3]], [[META2]]}
254358
;.
255359
; CHECK-V1-IC1-FORCE-EPI4: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
256360
; CHECK-V1-IC1-FORCE-EPI4: [[LOOP1]] = distinct !{[[LOOP1]], [[META2:![0-9]+]], [[META3:![0-9]+]]}
@@ -267,6 +371,9 @@ for.cond.cleanup:
267371
; CHECK-V1-IC1-FORCE-EPI4: [[PROF12]] = !{!"branch_weights", i32 4, i32 28}
268372
; CHECK-V1-IC1-FORCE-EPI4: [[LOOP13]] = distinct !{[[LOOP13]], [[META2]], [[META3]]}
269373
; CHECK-V1-IC1-FORCE-EPI4: [[LOOP14]] = distinct !{[[LOOP14]], [[META3]], [[META2]]}
374+
; CHECK-V1-IC1-FORCE-EPI4: [[LOOP15]] = distinct !{[[LOOP15]], [[META2]], [[META3]]}
375+
; CHECK-V1-IC1-FORCE-EPI4: [[LOOP16]] = distinct !{[[LOOP16]], [[META2]], [[META3]]}
376+
; CHECK-V1-IC1-FORCE-EPI4: [[LOOP17]] = distinct !{[[LOOP17]], [[META3]], [[META2]]}
270377
;.
271378
; CHECK-V2-IC1: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
272379
; CHECK-V2-IC1: [[PROF1]] = !{!"branch_weights", i32 1, i32 255}
@@ -282,6 +389,8 @@ for.cond.cleanup:
282389
; CHECK-V2-IC1: [[PROF11]] = !{!"branch_weights", i32 4, i32 12}
283390
; CHECK-V2-IC1: [[LOOP12]] = distinct !{[[LOOP12]], [[META3]], [[META4]]}
284391
; CHECK-V2-IC1: [[LOOP13]] = distinct !{[[LOOP13]], [[META4]], [[META3]]}
392+
; CHECK-V2-IC1: [[LOOP14]] = distinct !{[[LOOP14]], [[META3]], [[META4]]}
393+
; CHECK-V2-IC1: [[LOOP15]] = distinct !{[[LOOP15]], [[META4]], [[META3]]}
285394
;.
286395
; CHECK-V2-IC4: [[PROF0]] = !{!"branch_weights", i32 1, i32 127}
287396
; CHECK-V2-IC4: [[PROF1]] = !{!"branch_weights", i32 1, i32 63}
@@ -299,4 +408,7 @@ for.cond.cleanup:
299408
; CHECK-V2-IC4: [[LOOP13]] = distinct !{[[LOOP13]], [[META3]], [[META4]]}
300409
; CHECK-V2-IC4: [[PROF14]] = !{!"branch_weights", i32 1, i32 7}
301410
; CHECK-V2-IC4: [[LOOP15]] = distinct !{[[LOOP15]], [[META4]], [[META3]]}
411+
; CHECK-V2-IC4: [[LOOP16]] = distinct !{[[LOOP16]], [[META3]], [[META4]]}
412+
; CHECK-V2-IC4: [[LOOP17]] = distinct !{[[LOOP17]], [[META3]], [[META4]]}
413+
; CHECK-V2-IC4: [[LOOP18]] = distinct !{[[LOOP18]], [[META4]], [[META3]]}
302414
;.

0 commit comments

Comments
 (0)