Skip to content

Commit 189b639

Browse files
committed
!fixup fix profile info
1 parent c9228c1 commit 189b639

File tree

7 files changed

+85
-130
lines changed

7 files changed

+85
-130
lines changed

llvm/include/llvm/Transforms/Utils/LoopUtils.h

Lines changed: 0 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -508,23 +508,6 @@ LLVM_ABI int rewriteLoopExitValues(Loop *L, LoopInfo *LI,
508508
ReplaceExitVal ReplaceExitValue,
509509
SmallVector<WeakTrackingVH, 16> &DeadInsts);
510510

511-
/// Set weights for \p UnrolledLoop and \p RemainderLoop based on weights for
512-
/// \p OrigLoop and the following distribution of \p OrigLoop iteration among \p
513-
/// UnrolledLoop and \p RemainderLoop. \p UnrolledLoop receives weights that
514-
/// reflect TC/UF iterations, and \p RemainderLoop receives weights that reflect
515-
/// the remaining TC%UF iterations.
516-
///
517-
/// Note that \p OrigLoop may be equal to either \p UnrolledLoop or \p
518-
/// RemainderLoop in which case weights for \p OrigLoop are updated accordingly.
519-
/// Note also behavior is undefined if \p UnrolledLoop and \p RemainderLoop are
520-
/// equal. \p UF must be greater than zero.
521-
/// If \p OrigLoop has no profile info associated nothing happens.
522-
///
523-
/// This utility may be useful for such optimizations as unroller and
524-
/// vectorizer as it's typical transformation for them.
525-
LLVM_ABI void setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop,
526-
Loop *RemainderLoop, uint64_t UF);
527-
528511
/// Utility that implements appending of loops onto a worklist given a range.
529512
/// We want to process loops in postorder, but the worklist is a LIFO data
530513
/// structure, so we append to it in *reverse* postorder.

llvm/lib/Transforms/Utils/LoopUtils.cpp

Lines changed: 0 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -1781,32 +1781,6 @@ int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI,
17811781
return NumReplaced;
17821782
}
17831783

1784-
/// Set weights for \p UnrolledLoop and \p RemainderLoop based on weights for
1785-
/// \p OrigLoop.
1786-
void llvm::setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop,
1787-
Loop *RemainderLoop, uint64_t UF) {
1788-
assert(UF > 0 && "Zero unrolled factor is not supported");
1789-
assert(UnrolledLoop != RemainderLoop &&
1790-
"Unrolled and Remainder loops are expected to distinct");
1791-
1792-
// Get number of iterations in the original scalar loop.
1793-
unsigned OrigLoopInvocationWeight = 0;
1794-
std::optional<unsigned> OrigAverageTripCount =
1795-
getLoopEstimatedTripCount(OrigLoop, &OrigLoopInvocationWeight);
1796-
if (!OrigAverageTripCount)
1797-
return;
1798-
1799-
// Calculate number of iterations in unrolled loop.
1800-
unsigned UnrolledAverageTripCount = *OrigAverageTripCount / UF;
1801-
// Calculate number of iterations for remainder loop.
1802-
unsigned RemainderAverageTripCount = *OrigAverageTripCount % UF;
1803-
1804-
setLoopEstimatedTripCount(UnrolledLoop, UnrolledAverageTripCount,
1805-
OrigLoopInvocationWeight);
1806-
setLoopEstimatedTripCount(RemainderLoop, RemainderAverageTripCount,
1807-
OrigLoopInvocationWeight);
1808-
}
1809-
18101784
/// Utility that implements appending of loops onto a worklist.
18111785
/// Loops are added in preorder (analogous for reverse postorder for trees),
18121786
/// and the worklist is processed LIFO.

llvm/lib/Transforms/Vectorize/LoopVectorize.cpp

Lines changed: 35 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -2573,26 +2573,6 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State) {
25732573
// Remove redundant induction instructions.
25742574
cse(HeaderBB);
25752575

2576-
if (!Plan.getScalarPreheader()->hasPredecessors())
2577-
return;
2578-
2579-
// Set/update profile weights for the vector and remainder loops as original
2580-
// loop iterations are now distributed among them. Note that original loop
2581-
// becomes the scalar remainder loop after vectorization.
2582-
//
2583-
// For cases like foldTailByMasking() and requiresScalarEpiloque() we may
2584-
// end up getting slightly roughened result but that should be OK since
2585-
// profile is not inherently precise anyway. Note also possible bypass of
2586-
// vector code caused by legality checks is ignored, assigning all the weight
2587-
// to the vector loop, optimistically.
2588-
//
2589-
// For scalable vectorization we can't know at compile time how many
2590-
// iterations of the loop are handled in one vector iteration, so instead
2591-
// use the value of vscale used for tuning.
2592-
Loop *VectorLoop = LI->getLoopFor(HeaderBB);
2593-
unsigned EstimatedVFxUF =
2594-
estimateElementCount(VF * UF, Cost->getVScaleForTuning());
2595-
setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop, EstimatedVFxUF);
25962576
}
25972577

25982578
void InnerLoopVectorizer::fixNonInductionPHIs(VPTransformState &State) {
@@ -7325,6 +7305,9 @@ MDNode *LID = OrigLoop->getLoopID();
73257305
std::optional<MDNode *> VectorizedLoopID = makeFollowupLoopID(
73267306
LID,
73277307
{LLVMLoopVectorizeFollowupAll, LLVMLoopVectorizeFollowupVectorized});
7308+
unsigned OrigLoopInvocationWeight = 0;
7309+
std::optional<unsigned> OrigAverageTripCount =
7310+
getLoopEstimatedTripCount(OrigLoop, &OrigLoopInvocationWeight);
73287311

73297312
BestVPlan.execute(&State);
73307313

@@ -7396,6 +7379,38 @@ MDNode *LID = OrigLoop->getLoopID();
73967379
addRuntimeUnrollDisableMetaData(L);
73977380
}
73987381

7382+
// Set/update profile weights for the vector and remainder loops as original
7383+
// loop iterations are now distributed among them. Note that original loop
7384+
// becomes the scalar remainder loop after vectorization.
7385+
//
7386+
// For cases like foldTailByMasking() and requiresScalarEpilogue() we may
7387+
// end up with a slightly rough result, but that should be OK since
7388+
// profile is not inherently precise anyway. Note also possible bypass of
7389+
// vector code caused by legality checks is ignored, assigning all the weight
7390+
// to the vector loop, optimistically.
7391+
//
7392+
// For scalable vectorization we can't know at compile time how many
7393+
// iterations of the loop are handled in one vector iteration, so instead
7394+
// use the value of vscale used for tuning.
7395+
if (OrigAverageTripCount) {
7396+
unsigned EstimatedVFxUF =
7397+
estimateElementCount(BestVF * BestUF, CM.getVScaleForTuning());
7398+
// Calculate number of iterations in the vector loop.
7399+
unsigned AverageVectorTripCount = *OrigAverageTripCount / EstimatedVFxUF;
7400+
// Calculate number of iterations for the scalar remainder loop.
7401+
unsigned RemainderAverageTripCount = *OrigAverageTripCount % EstimatedVFxUF;
7402+
7403+
if (HeaderVPBB) {
7404+
Loop *VectorLoop = LI->getLoopFor(State.CFG.VPBB2IRBB[HeaderVPBB]);
7405+
setLoopEstimatedTripCount(VectorLoop, AverageVectorTripCount,
7406+
OrigLoopInvocationWeight);
7407+
}
7408+
if (BestVPlan.getScalarPreheader()->hasPredecessors()) {
7409+
setLoopEstimatedTripCount(OrigLoop, RemainderAverageTripCount,
7410+
OrigLoopInvocationWeight);
7411+
}
7412+
}
7413+
73997414
// 3. Fix the vectorized code: take care of header phi's, live-outs,
74007415
// predication, updating analyses.
74017416
ILV.fixVectorizedLoop(State);

llvm/test/Transforms/LoopVectorize/X86/pr81872.ll

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -37,16 +37,16 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 {
3737
; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
3838
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i64> [[VEC_IND]], splat (i64 -4)
3939
; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 12
40-
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
40+
; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
4141
; CHECK: middle.block:
4242
; CHECK-NEXT: br label [[BB6:%.*]]
4343
; CHECK: scalar.ph:
4444
; CHECK-NEXT: br label [[LOOP_HEADER:%.*]]
4545
; CHECK: loop.header:
46-
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 99, [[SCALAR_PH:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
46+
; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 99, [[SCALAR_PH:%.+]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
4747
; CHECK-NEXT: [[AND:%.*]] = and i64 [[IV]], 1
4848
; CHECK-NEXT: [[ICMP17:%.*]] = icmp eq i64 [[AND]], 0
49-
; CHECK-NEXT: br i1 [[ICMP17]], label [[BB18:%.*]], label [[LOOP_LATCH]], !prof [[PROF3:![0-9]+]]
49+
; CHECK-NEXT: br i1 [[ICMP17]], label [[BB18:%.*]], label [[LOOP_LATCH]], !prof [[PROF5:![0-9]+]]
5050
; CHECK: bb18:
5151
; CHECK-NEXT: [[OR:%.*]] = or disjoint i64 [[IV]], 1
5252
; CHECK-NEXT: [[GETELEMENTPTR19:%.*]] = getelementptr inbounds i64, ptr [[ARR]], i64 [[OR]]
@@ -55,7 +55,7 @@ define void @test(ptr noundef align 8 dereferenceable_or_null(16) %arr) #0 {
5555
; CHECK: loop.latch:
5656
; CHECK-NEXT: [[IV_NEXT]] = add nsw i64 [[IV]], -1
5757
; CHECK-NEXT: [[ICMP22:%.*]] = icmp eq i64 [[IV_NEXT]], 90
58-
; CHECK-NEXT: br i1 [[ICMP22]], label [[BB6]], label [[LOOP_HEADER]], !prof [[PROF4:![0-9]+]]
58+
; CHECK-NEXT: br i1 [[ICMP22]], label [[BB6]], label [[LOOP_HEADER]], !prof [[PROF6:![0-9]+]]
5959
; CHECK: bb6:
6060
; CHECK-NEXT: ret void
6161
;
@@ -94,9 +94,10 @@ attributes #0 = {"target-cpu"="haswell" "target-features"="+avx2" }
9494

9595

9696
;.
97-
; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
98-
; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1}
99-
; CHECK: [[META2]] = !{!"llvm.loop.unroll.runtime.disable"}
100-
; CHECK: [[PROF3]] = !{!"branch_weights", i32 1, i32 1}
101-
; CHECK: [[PROF4]] = !{!"branch_weights", i32 1, i32 95}
97+
; CHECK: [[PROF1]] = !{!"branch_weights", i32 1, i32 23}
98+
; CHECK: [[LOOP2]] = distinct !{[[LOOP2]], [[META3:![0-9]+]], [[META4:![0-9]+]]}
99+
; CHECK: [[META3]] = !{!"llvm.loop.isvectorized", i32 1}
100+
; CHECK: [[META4]] = !{!"llvm.loop.unroll.runtime.disable"}
101+
; CHECK: [[PROF5]] = !{!"branch_weights", i32 1, i32 1}
102+
; CHECK: [[PROF6]] = !{!"branch_weights", i32 1, i32 95}
102103
;.

llvm/test/Transforms/LoopVectorize/check-prof-info.ll

Lines changed: 12 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -17,29 +17,29 @@ define void @_Z3foov() {
1717
; CHECK: vector.ph:
1818
; CHECK: br label [[VECTOR_BODY:%.*]]
1919
; CHECK: vector.body:
20-
; CHECK: br i1 [[TMP4:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
20+
; CHECK: br i1 [[TMP6:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
2121
; CHECK: middle.block:
2222
; CHECK: br label [[FOR_COND_CLEANUP:%.*]]
2323
; CHECK: scalar.ph:
2424
; CHECK: br label [[FOR_BODY:%.*]]
2525
; CHECK: for.cond.cleanup:
2626
; CHECK: for.body:
27-
; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF3:![0-9]+]]
27+
; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF5:![0-9]+]]
2828
;
2929
; CHECK-MASKED-LABEL: @_Z3foov(
3030
; CHECK-MASKED: entry:
3131
; CHECK-MASKED: br label [[VECTOR_PH:%.*]]
3232
; CHECK-MASKED: vector.ph:
3333
; CHECK-MASKED: br label [[VECTOR_BODY:%.*]]
3434
; CHECK-MASKED: vector.body:
35-
; CHECK-MASKED: br i1 [[TMP19:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
35+
; CHECK-MASKED: br i1 [[TMP18:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
3636
; CHECK-MASKED: middle.block:
3737
; CHECK-MASKED: br label [[FOR_COND_CLEANUP:%.*]]
3838
; CHECK-MASKED: scalar.ph:
3939
; CHECK-MASKED: br label [[FOR_BODY:%.*]]
4040
; CHECK-MASKED: for.cond.cleanup:
4141
; CHECK-MASKED: for.body:
42-
; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF3:![0-9]+]]
42+
; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !prof [[PROF5:![0-9]+]]
4343
;
4444
; CHECK-SCALABLE-LABEL: @_Z3foov(
4545
; CHECK-SCALABLE: entry:
@@ -50,7 +50,7 @@ define void @_Z3foov() {
5050
; CHECK-SCALABLE: br label [[VECTOR_BODY:%.*]]
5151
; CHECK-SCALABLE: vector.body:
5252
; CHECK-SCALABLE: [[VEC_IND_NEXT:%.*]] = add <vscale x 4 x i32> [[VEC_IND:%.*]], [[BROADCAST_SPLAT]]
53-
; CHECK-SCALABLE: br i1 [[TMP12:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
53+
; CHECK-SCALABLE: br i1 [[TMP16:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1:![0-9]+]], !llvm.loop [[LOOP2:![0-9]+]]
5454
; CHECK-SCALABLE: middle.block:
5555
; CHECK-SCALABLE: br i1 [[CMP_N:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5:![0-9]+]]
5656
; CHECK-SCALABLE: scalar.ph:
@@ -88,29 +88,29 @@ define void @_Z3foo2v() {
8888
; CHECK: vector.ph:
8989
; CHECK: br label [[VECTOR_BODY:%.*]]
9090
; CHECK: vector.body:
91-
; CHECK: br i1 [[TMP4:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF4:![0-9]+]], !llvm.loop [[LOOP5:![0-9]+]]
91+
; CHECK: br i1 [[TMP6:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP7:![0-9]+]]
9292
; CHECK: middle.block:
93-
; CHECK: br label [[SCALAR_PH:%.*]]
93+
; CHECK: br label [[SCALAR_PH:%.+]]
9494
; CHECK: scalar.ph:
9595
; CHECK: br label [[FOR_BODY:%.*]]
9696
; CHECK: for.cond.cleanup:
9797
; CHECK: for.body:
98-
; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
98+
; CHECK: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !prof [[PROF8:![0-9]+]], !llvm.loop [[LOOP9:![0-9]+]]
9999
;
100100
; CHECK-MASKED-LABEL: @_Z3foo2v(
101101
; CHECK-MASKED: entry:
102102
; CHECK-MASKED: br label [[VECTOR_PH:%.*]]
103103
; CHECK-MASKED: vector.ph:
104104
; CHECK-MASKED: br label [[VECTOR_BODY:%.*]]
105105
; CHECK-MASKED: vector.body:
106-
; CHECK-MASKED: br i1 [[TMP16:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF4:![0-9]+]], !llvm.loop [[LOOP5:![0-9]+]]
106+
; CHECK-MASKED: br i1 [[TMP18:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP7:![0-9]+]]
107107
; CHECK-MASKED: middle.block:
108-
; CHECK-MASKED: br label [[SCALAR_PH:%.*]]
108+
; CHECK-MASKED: br label [[SCALAR_PH:%.+]]
109109
; CHECK-MASKED: scalar.ph:
110110
; CHECK-MASKED: br label [[FOR_BODY:%.*]]
111111
; CHECK-MASKED: for.cond.cleanup:
112112
; CHECK-MASKED: for.body:
113-
; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !prof [[PROF6:![0-9]+]], !llvm.loop [[LOOP7:![0-9]+]]
113+
; CHECK-MASKED: br i1 [[EXITCOND:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !prof [[PROF8:![0-9]+]], !llvm.loop [[LOOP9:![0-9]+]]
114114
;
115115
; CHECK-SCALABLE-LABEL: @_Z3foo2v(
116116
; CHECK-SCALABLE: entry:
@@ -121,7 +121,7 @@ define void @_Z3foo2v() {
121121
; CHECK-SCALABLE: br label [[VECTOR_BODY:%.*]]
122122
; CHECK-SCALABLE: vector.body:
123123
; CHECK-SCALABLE: [[VEC_IND_NEXT:%.*]] = add <vscale x 4 x i32> [[VEC_IND:%.*]], [[BROADCAST_SPLAT]]
124-
; CHECK-SCALABLE: br i1 [[TMP12:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP8:![0-9]+]]
124+
; CHECK-SCALABLE: br i1 [[TMP16:%.*]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !prof [[PROF1]], !llvm.loop [[LOOP8:![0-9]+]]
125125
; CHECK-SCALABLE: middle.block:
126126
; CHECK-SCALABLE: br i1 [[CMP_N:%.*]], label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]], !prof [[PROF5]]
127127
; CHECK-SCALABLE: scalar.ph:

0 commit comments

Comments
 (0)