Skip to content

Commit 9ac84a6

Browse files
authored
[MergeICmp][profcheck] Propagate profile info (#167594)
Propagate branch weights in `mergeComparisons`: the probability of reaching the common "exit" BB (`bb_phi` in the description in `processPhi`) doesn't change, and is the disjunction of the probabilities of reaching it from each of the comparison blocks that are now being merged. Issue #147390
1 parent cfc74dd commit 9ac84a6

File tree

5 files changed

+90
-23
lines changed

5 files changed

+90
-23
lines changed

llvm/include/llvm/IR/ProfDataUtils.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -149,6 +149,9 @@ LLVM_ABI bool extractProfTotalWeight(const Instruction &I,
149149
LLVM_ABI void setBranchWeights(Instruction &I, ArrayRef<uint32_t> Weights,
150150
bool IsExpected, bool ElideAllZero = false);
151151

152+
/// Push the weights right to fit in uint32_t.
153+
LLVM_ABI SmallVector<uint32_t> fitWeights(ArrayRef<uint64_t> Weights);
154+
152155
/// Variant of `setBranchWeights` where the `Weights` will be fit first to
153156
/// uint32_t by shifting right.
154157
LLVM_ABI void setFittedBranchWeights(Instruction &I, ArrayRef<uint64_t> Weights,

llvm/lib/IR/ProfDataUtils.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,7 @@ static void extractFromBranchWeightMD(const MDNode *ProfileData,
8686
}
8787

8888
/// Push the weights right to fit in uint32_t.
89-
static SmallVector<uint32_t> fitWeights(ArrayRef<uint64_t> Weights) {
89+
SmallVector<uint32_t> llvm::fitWeights(ArrayRef<uint64_t> Weights) {
9090
SmallVector<uint32_t> Ret;
9191
Ret.reserve(Weights.size());
9292
uint64_t Max = *llvm::max_element(Weights);

llvm/lib/Transforms/Scalar/MergeICmps.cpp

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,8 +50,9 @@
5050
#include "llvm/Analysis/TargetTransformInfo.h"
5151
#include "llvm/IR/Dominators.h"
5252
#include "llvm/IR/Function.h"
53-
#include "llvm/IR/Instruction.h"
5453
#include "llvm/IR/IRBuilder.h"
54+
#include "llvm/IR/Instruction.h"
55+
#include "llvm/IR/ProfDataUtils.h"
5556
#include "llvm/InitializePasses.h"
5657
#include "llvm/Pass.h"
5758
#include "llvm/Transforms/Scalar.h"
@@ -66,6 +67,9 @@ using namespace llvm;
6667

6768
#define DEBUG_TYPE "mergeicmps"
6869

70+
namespace llvm {
71+
extern cl::opt<bool> ProfcheckDisableMetadataFixes;
72+
} // namespace llvm
6973
namespace {
7074

7175
// A BCE atom "Binary Compare Expression Atom" represents an integer load
@@ -607,6 +611,37 @@ class MergedBlockName {
607611
};
608612
} // namespace
609613

614+
/// Compute the branch weights for the conditional branch that results from
615+
/// merging \p Comparisons.
///
/// Returns std::nullopt when `ProfcheckDisableMetadataFixes` is set, or when
/// branch weights cannot be extracted from the terminator of any block in
/// \p Comparisons. \p Comparisons must not be empty.
616+
static std::optional<SmallVector<uint32_t, 2>>
617+
computeMergedBranchWeights(ArrayRef<BCECmpBlock> Comparisons) {
618+
assert(!Comparisons.empty());
619+
if (ProfcheckDisableMetadataFixes)
620+
return std::nullopt;
// With a single comparison there is nothing to combine: reuse the weights of
// that block's terminator unchanged.
621+
if (Comparisons.size() == 1) {
622+
SmallVector<uint32_t, 2> Weights;
623+
if (!extractBranchWeights(*Comparisons[0].BB->getTerminator(), Weights))
624+
return std::nullopt;
625+
return Weights;
626+
}
627+
// The probability to go to the phi block is the disjunction of the
628+
// probability to go to the phi block from the individual Comparisons. We'll
629+
// swap the weights because `getDisjunctionWeights` computes the disjunction
630+
// for the "true" branch, then swap back.
631+
SmallVector<uint64_t, 2> Weights{0, 1};
632+
// At this point, Weights encodes "0-probability" for the "true" side.
633+
for (const auto &C : Comparisons) {
634+
SmallVector<uint32_t, 2> W;
635+
if (!extractBranchWeights(*C.BB->getTerminator(), W))
636+
return std::nullopt;
637+
638+
// Swap this block's weights (see the comment above) so that they line up
// with the already-swapped accumulator before combining.
std::swap(W[0], W[1]);
639+
Weights = getDisjunctionWeights(Weights, W);
640+
}
// Undo the swap so the weights are back in the order used by the terminators
// they were extracted from.
641+
std::swap(Weights[0], Weights[1]);
// The accumulator holds 64-bit weights; fit them into uint32_t for the result.
642+
return fitWeights(Weights);
643+
}
644+
610645
// Merges the given contiguous comparison blocks into one memcmp block.
611646
static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
612647
BasicBlock *const InsertBefore,
@@ -640,7 +675,7 @@ static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
640675
// If there is one block that requires splitting, we do it now, i.e.
641676
// just before we know we will collapse the chain. The instructions
642677
// can be executed before any of the instructions in the chain.
643-
const auto ToSplit = llvm::find_if(
678+
const auto *ToSplit = llvm::find_if(
644679
Comparisons, [](const BCECmpBlock &B) { return B.RequireSplit; });
645680
if (ToSplit != Comparisons.end()) {
646681
LLVM_DEBUG(dbgs() << "Splitting non_BCE work to header\n");
@@ -655,6 +690,7 @@ static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
655690
LhsLoad->replaceUsesOfWith(LhsLoad->getOperand(0), Lhs);
656691
RhsLoad->replaceUsesOfWith(RhsLoad->getOperand(0), Rhs);
657692
// There are no blocks to merge, just do the comparison.
693+
// If we condition on this IsEqual, we already have its probabilities.
658694
IsEqual = Builder.CreateICmpEQ(LhsLoad, RhsLoad);
659695
} else {
660696
const unsigned TotalSizeBits = std::accumulate(
@@ -684,7 +720,9 @@ static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
684720
DTU.applyUpdates({{DominatorTree::Insert, BB, PhiBB}});
685721
} else {
686722
// Continue to next block if equal, exit to phi else.
687-
Builder.CreateCondBr(IsEqual, NextCmpBlock, PhiBB);
723+
auto *BI = Builder.CreateCondBr(IsEqual, NextCmpBlock, PhiBB);
724+
if (auto BranchWeights = computeMergedBranchWeights(Comparisons))
725+
setBranchWeights(*BI, BranchWeights.value(), /*IsExpected=*/false);
688726
Phi.addIncoming(ConstantInt::getFalse(Context), BB);
689727
DTU.applyUpdates({{DominatorTree::Insert, BB, NextCmpBlock},
690728
{DominatorTree::Insert, BB, PhiBB}});

llvm/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll

Lines changed: 25 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
22
; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes=mergeicmps -verify-dom-info -S | FileCheck %s --check-prefix=X86
33

44
%S = type { i32, i32, i32, i32, i32}
@@ -15,7 +15,7 @@ define zeroext i1 @opeq1(
1515
; X86-NEXT: ret i1 [[TMP2]]
1616
;
1717
ptr nocapture readonly dereferenceable(16) %a,
18-
ptr nocapture readonly dereferenceable(16) %b) local_unnamed_addr nofree nosync {
18+
ptr nocapture readonly dereferenceable(16) %b) local_unnamed_addr nofree nosync !prof !2 {
1919

2020
entry:
2121
%ptr = alloca i32
@@ -24,23 +24,23 @@ entry:
2424
; Does other work, has no interference, merge block
2525
store i32 42, ptr %ptr
2626
%cmp.i = icmp eq i32 %0, %1
27-
br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
27+
br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit, !prof !3
2828

2929
land.rhs.i:
3030
%second.i = getelementptr inbounds %S, ptr %a, i64 0, i32 1
3131
%2 = load i32, ptr %second.i, align 4
3232
%second2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 1
3333
%3 = load i32, ptr %second2.i, align 4
3434
%cmp2.i = icmp eq i32 %2, %3
35-
br i1 %cmp2.i, label %land.rhs.i.2, label %opeq1.exit
35+
br i1 %cmp2.i, label %land.rhs.i.2, label %opeq1.exit, !prof !4
3636

3737
land.rhs.i.2:
3838
%third.i = getelementptr inbounds %S, ptr %a, i64 0, i32 2
3939
%4 = load i32, ptr %third.i, align 4
4040
%third2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 2
4141
%5 = load i32, ptr %third2.i, align 4
4242
%cmp3.i = icmp eq i32 %4, %5
43-
br i1 %cmp3.i, label %land.rhs.i.3, label %opeq1.exit
43+
br i1 %cmp3.i, label %land.rhs.i.3, label %opeq1.exit, !prof !5
4444

4545
land.rhs.i.3:
4646
%fourth.i = getelementptr inbounds %S, ptr %a, i64 0, i32 3
@@ -55,15 +55,15 @@ opeq1.exit:
5555
ret i1 %8
5656
}
5757

58-
define zeroext i1 @part_sequent_eq_with_metadata() {
58+
define zeroext i1 @part_sequent_eq_with_metadata() !prof !2 {
5959
; X86-LABEL: @part_sequent_eq_with_metadata(
6060
; X86-NEXT: bb01:
6161
; X86-NEXT: [[A:%.*]] = alloca [[S:%.*]], align 8
6262
; X86-NEXT: [[B:%.*]] = alloca [[S]], align 8
63-
; X86-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !range [[RNG0:![0-9]+]], !noundef !1
64-
; X86-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4, !range [[RNG0]], !noundef !1
63+
; X86-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !range [[RNG1:![0-9]+]], !noundef [[META2:![0-9]+]]
64+
; X86-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4, !range [[RNG1]], !noundef [[META2]]
6565
; X86-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
66-
; X86-NEXT: br i1 [[TMP2]], label %"bb1+bb2+bb3", label [[EXIT:%.*]]
66+
; X86-NEXT: br i1 [[TMP2]], label %"bb1+bb2+bb3", label [[EXIT:%.*]], !prof [[PROF3:![0-9]+]]
6767
; X86: "bb1+bb2+bb3":
6868
; X86-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[S]], ptr [[A]], i64 0, i32 2
6969
; X86-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[S]], ptr [[B]], i64 0, i32 2
@@ -80,23 +80,23 @@ bb0:
8080
%value0 = load i32, ptr %a, align 4, !range !0, !noundef !1
8181
%value1 = load i32, ptr %b, align 4, !range !0, !noundef !1
8282
%cmp.i = icmp eq i32 %value0, %value1
83-
br i1 %cmp.i, label %bb1, label %exit
83+
br i1 %cmp.i, label %bb1, label %exit, !prof !3
8484

8585
bb1:
8686
%second.i = getelementptr inbounds %S, ptr %a, i64 0, i32 2
8787
%value2 = load i32, ptr %second.i, align 4
8888
%second2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 2
8989
%value3 = load i32, ptr %second2.i, align 4
9090
%cmp2.i = icmp eq i32 %value2, %value3
91-
br i1 %cmp2.i, label %bb2, label %exit
91+
br i1 %cmp2.i, label %bb2, label %exit, !prof !4
9292

9393
bb2:
9494
%third.i = getelementptr inbounds %S, ptr %a, i64 0, i32 3
9595
%value4 = load i32, ptr %third.i, align 4
9696
%third2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 3
9797
%value5 = load i32, ptr %third2.i, align 4
9898
%cmp3.i = icmp eq i32 %value4, %value5
99-
br i1 %cmp3.i, label %bb3, label %exit
99+
br i1 %cmp3.i, label %bb3, label %exit, !prof !5
100100

101101
bb3:
102102
%fourth.i = getelementptr inbounds %S, ptr %a, i64 0, i32 4
@@ -113,3 +113,16 @@ exit:
113113

114114
!0 = !{i32 0, i32 2}
115115
!1 = !{}
116+
!2 = !{!"function_entry_count", i32 100}
117+
!3 = !{!"branch_weights", i32 2, i32 3}
118+
!4 = !{!"branch_weights", i32 5, i32 7}
119+
!5 = !{!"branch_weights", i32 11, i32 13}
120+
;.
121+
; X86: attributes #[[ATTR0:[0-9]+]] = { nofree nosync }
122+
; X86: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: read) }
123+
;.
124+
; X86: [[META0:![0-9]+]] = !{!"function_entry_count", i32 100}
125+
; X86: [[RNG1]] = !{i32 0, i32 2}
126+
; X86: [[META2]] = !{}
127+
; X86: [[PROF3]] = !{!"branch_weights", i32 2, i32 3}
128+
;.

llvm/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll

Lines changed: 20 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
1+
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
22
; RUN: opt < %s -passes=mergeicmps -verify-dom-info -mtriple=x86_64-unknown-unknown -S | FileCheck %s
33

44
%S = type { i32, i32, i32, i32 }
@@ -15,11 +15,11 @@ define zeroext i1 @opeq1(
1515
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4
1616
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
1717
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]]
18-
; CHECK-NEXT: br i1 [[TMP4]], label %"land.rhs.i+land.rhs.i.2", label [[OPEQ1_EXIT:%.*]]
18+
; CHECK-NEXT: br i1 [[TMP4]], label %"land.rhs.i+land.rhs.i.2", label [[OPEQ1_EXIT:%.*]], !prof [[PROF1:![0-9]+]]
1919
; CHECK: "land.rhs.i+land.rhs.i.2":
2020
; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(ptr [[A]], ptr [[B]], i64 8)
2121
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[MEMCMP]], 0
22-
; CHECK-NEXT: br i1 [[TMP5]], label [[LAND_RHS_I_31:%.*]], label [[OPEQ1_EXIT]]
22+
; CHECK-NEXT: br i1 [[TMP5]], label [[LAND_RHS_I_31:%.*]], label [[OPEQ1_EXIT]], !prof [[PROF2:![0-9]+]]
2323
; CHECK: land.rhs.i.31:
2424
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[S]], ptr [[A]], i64 0, i32 3
2525
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[S]], ptr [[B]], i64 0, i32 3
@@ -32,28 +32,28 @@ define zeroext i1 @opeq1(
3232
; CHECK-NEXT: ret i1 [[TMP11]]
3333
;
3434
ptr nocapture readonly dereferenceable(16) %a,
35-
ptr nocapture readonly dereferenceable(16) %b) local_unnamed_addr nofree nosync {
35+
ptr nocapture readonly dereferenceable(16) %b) local_unnamed_addr nofree nosync !prof !0 {
3636
entry:
3737
%first.i = getelementptr inbounds %S, ptr %a, i64 0, i32 3
3838
%0 = load i32, ptr %first.i, align 4
3939
%first1.i = getelementptr inbounds %S, ptr %b, i64 0, i32 2
4040
%1 = load i32, ptr %first1.i, align 4
4141
%cmp.i = icmp eq i32 %0, %1
42-
br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
42+
br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit, !prof !1
4343

4444
land.rhs.i:
4545
%2 = load i32, ptr %a, align 4
4646
%3 = load i32, ptr %b, align 4
4747
%cmp3.i = icmp eq i32 %2, %3
48-
br i1 %cmp3.i, label %land.rhs.i.2, label %opeq1.exit
48+
br i1 %cmp3.i, label %land.rhs.i.2, label %opeq1.exit, !prof !2
4949

5050
land.rhs.i.2:
5151
%third.i = getelementptr inbounds %S, ptr %a, i64 0, i32 1
5252
%4 = load i32, ptr %third.i, align 4
5353
%third2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 1
5454
%5 = load i32, ptr %third2.i, align 4
5555
%cmp4.i = icmp eq i32 %4, %5
56-
br i1 %cmp4.i, label %land.rhs.i.3, label %opeq1.exit
56+
br i1 %cmp4.i, label %land.rhs.i.3, label %opeq1.exit, !prof !3
5757

5858
land.rhs.i.3:
5959
%fourth.i = getelementptr inbounds %S, ptr %a, i64 0, i32 3
@@ -67,3 +67,16 @@ opeq1.exit:
6767
%8 = phi i1 [ false, %entry ], [ false, %land.rhs.i], [ false, %land.rhs.i.2 ], [ %cmp5.i, %land.rhs.i.3 ]
6868
ret i1 %8
6969
}
70+
71+
!0 = !{!"function_entry_count", i32 10}
72+
!1 = !{!"branch_weights", i32 2, i32 3}
73+
!2 = !{!"branch_weights", i32 5, i32 7}
74+
!3 = !{!"branch_weights", i32 11, i32 13}
75+
;.
76+
; CHECK: attributes #[[ATTR0:[0-9]+]] = { nofree nosync }
77+
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: read) }
78+
;.
79+
; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10}
80+
; CHECK: [[PROF1]] = !{!"branch_weights", i32 2, i32 3}
81+
; CHECK: [[PROF2]] = !{!"branch_weights", i32 55, i32 233}
82+
;.

0 commit comments

Comments
 (0)