Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions llvm/include/llvm/IR/ProfDataUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,9 @@ LLVM_ABI bool extractProfTotalWeight(const Instruction &I,
LLVM_ABI void setBranchWeights(Instruction &I, ArrayRef<uint32_t> Weights,
bool IsExpected, bool ElideAllZero = false);

/// Push the weights right to fit in uint32_t, i.e. return a uint32_t copy of
/// \p Weights obtained by shifting right.
/// NOTE(review): \p Weights is presumably required to be non-empty, since the
/// implementation dereferences the result of max_element on it — confirm.
LLVM_ABI SmallVector<uint32_t> fitWeights(ArrayRef<uint64_t> Weights);

/// Variant of `setBranchWeights` where the `Weights` will be fit first to
/// uint32_t by shifting right.
LLVM_ABI void setFittedBranchWeights(Instruction &I, ArrayRef<uint64_t> Weights,
Expand Down
2 changes: 1 addition & 1 deletion llvm/lib/IR/ProfDataUtils.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ static void extractFromBranchWeightMD(const MDNode *ProfileData,
}

/// Push the weights right to fit in uint32_t.
static SmallVector<uint32_t> fitWeights(ArrayRef<uint64_t> Weights) {
SmallVector<uint32_t> llvm::fitWeights(ArrayRef<uint64_t> Weights) {
SmallVector<uint32_t> Ret;
Ret.reserve(Weights.size());
uint64_t Max = *llvm::max_element(Weights);
Expand Down
44 changes: 41 additions & 3 deletions llvm/lib/Transforms/Scalar/MergeICmps.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,9 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/ProfDataUtils.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Scalar.h"
Expand All @@ -66,6 +67,9 @@ using namespace llvm;

#define DEBUG_TYPE "mergeicmps"

namespace llvm {
// Boolean command-line option that is only declared here; its definition is
// not in this file. When it is set, computeMergedBranchWeights() below returns
// std::nullopt, so the merged conditional branch is created without branch
// weights.
extern cl::opt<bool> ProfcheckDisableMetadataFixes;
} // namespace llvm
namespace {

// A BCE atom "Binary Compare Expression Atom" represents an integer load
Expand Down Expand Up @@ -607,6 +611,37 @@ class MergedBlockName {
};
} // namespace

/// Compute the branch weights for the single conditional branch that is
/// produced by merging \p Comparisons.
///
/// \param Comparisons the comparison blocks being merged; must not be empty.
/// \returns std::nullopt if ProfcheckDisableMetadataFixes is set or if branch
/// weights cannot be extracted from the terminator of any of the blocks;
/// otherwise the weights to attach to the merged branch.
static std::optional<SmallVector<uint32_t, 2>>
computeMergedBranchWeights(ArrayRef<BCECmpBlock> Comparisons) {
  assert(!Comparisons.empty());
  if (ProfcheckDisableMetadataFixes)
    return std::nullopt;

  // A lone comparison keeps whatever weights its own terminator carries.
  if (Comparisons.size() == 1) {
    SmallVector<uint32_t, 2> OnlyWeights;
    if (extractBranchWeights(*Comparisons.front().BB->getTerminator(),
                             OnlyWeights))
      return OnlyWeights;
    return std::nullopt;
  }

  // Reaching the phi block from the merged branch is the disjunction of
  // reaching it from each individual comparison. `getDisjunctionWeights`
  // accumulates the disjunction on the "true" side, so every per-block pair is
  // flipped before being folded in, and the accumulated pair is flipped back
  // once the loop is done.
  //
  // The accumulator starts out encoding "0-probability" for the "true" side.
  SmallVector<uint64_t, 2> Accumulated{0, 1};
  for (const BCECmpBlock &Cmp : Comparisons) {
    SmallVector<uint32_t, 2> BlockWeights;
    const bool HasWeights =
        extractBranchWeights(*Cmp.BB->getTerminator(), BlockWeights);
    if (!HasWeights)
      return std::nullopt;

    std::swap(BlockWeights[0], BlockWeights[1]);
    Accumulated = getDisjunctionWeights(Accumulated, BlockWeights);
  }

  // Undo the flip, then narrow the 64-bit accumulator back to uint32_t.
  std::swap(Accumulated[0], Accumulated[1]);
  return fitWeights(Accumulated);
}

// Merges the given contiguous comparison blocks into one memcmp block.
static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
BasicBlock *const InsertBefore,
Expand Down Expand Up @@ -640,7 +675,7 @@ static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
// If there is one block that requires splitting, we do it now, i.e.
// just before we know we will collapse the chain. The instructions
// can be executed before any of the instructions in the chain.
const auto ToSplit = llvm::find_if(
const auto *ToSplit = llvm::find_if(
Comparisons, [](const BCECmpBlock &B) { return B.RequireSplit; });
if (ToSplit != Comparisons.end()) {
LLVM_DEBUG(dbgs() << "Splitting non_BCE work to header\n");
Expand All @@ -655,6 +690,7 @@ static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
LhsLoad->replaceUsesOfWith(LhsLoad->getOperand(0), Lhs);
RhsLoad->replaceUsesOfWith(RhsLoad->getOperand(0), Rhs);
// There are no blocks to merge, just do the comparison.
// If we condition on this IsEqual, we already have its probabilities.
IsEqual = Builder.CreateICmpEQ(LhsLoad, RhsLoad);
} else {
const unsigned TotalSizeBits = std::accumulate(
Expand Down Expand Up @@ -684,7 +720,9 @@ static BasicBlock *mergeComparisons(ArrayRef<BCECmpBlock> Comparisons,
DTU.applyUpdates({{DominatorTree::Insert, BB, PhiBB}});
} else {
// Continue to next block if equal, exit to phi else.
Builder.CreateCondBr(IsEqual, NextCmpBlock, PhiBB);
auto *BI = Builder.CreateCondBr(IsEqual, NextCmpBlock, PhiBB);
if (auto BranchWeights = computeMergedBranchWeights(Comparisons))
setBranchWeights(*BI, BranchWeights.value(), /*IsExpected=*/false);
Phi.addIncoming(ConstantInt::getFalse(Context), BB);
DTU.applyUpdates({{DominatorTree::Insert, BB, NextCmpBlock},
{DominatorTree::Insert, BB, PhiBB}});
Expand Down
37 changes: 25 additions & 12 deletions llvm/test/Transforms/MergeICmps/X86/alias-merge-blocks.ll
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
; RUN: opt < %s -mtriple=x86_64-unknown-unknown -passes=mergeicmps -verify-dom-info -S | FileCheck %s --check-prefix=X86

%S = type { i32, i32, i32, i32, i32}
Expand All @@ -15,7 +15,7 @@ define zeroext i1 @opeq1(
; X86-NEXT: ret i1 [[TMP2]]
;
ptr nocapture readonly dereferenceable(16) %a,
ptr nocapture readonly dereferenceable(16) %b) local_unnamed_addr nofree nosync {
ptr nocapture readonly dereferenceable(16) %b) local_unnamed_addr nofree nosync !prof !2 {

entry:
%ptr = alloca i32
Expand All @@ -24,23 +24,23 @@ entry:
; Does other work, has no interference, merge block
store i32 42, ptr %ptr
%cmp.i = icmp eq i32 %0, %1
br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit, !prof !3

land.rhs.i:
%second.i = getelementptr inbounds %S, ptr %a, i64 0, i32 1
%2 = load i32, ptr %second.i, align 4
%second2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 1
%3 = load i32, ptr %second2.i, align 4
%cmp2.i = icmp eq i32 %2, %3
br i1 %cmp2.i, label %land.rhs.i.2, label %opeq1.exit
br i1 %cmp2.i, label %land.rhs.i.2, label %opeq1.exit, !prof !4

land.rhs.i.2:
%third.i = getelementptr inbounds %S, ptr %a, i64 0, i32 2
%4 = load i32, ptr %third.i, align 4
%third2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 2
%5 = load i32, ptr %third2.i, align 4
%cmp3.i = icmp eq i32 %4, %5
br i1 %cmp3.i, label %land.rhs.i.3, label %opeq1.exit
br i1 %cmp3.i, label %land.rhs.i.3, label %opeq1.exit, !prof !5

land.rhs.i.3:
%fourth.i = getelementptr inbounds %S, ptr %a, i64 0, i32 3
Expand All @@ -55,15 +55,15 @@ opeq1.exit:
ret i1 %8
}

define zeroext i1 @part_sequent_eq_with_metadata() {
define zeroext i1 @part_sequent_eq_with_metadata() !prof !2 {
; X86-LABEL: @part_sequent_eq_with_metadata(
; X86-NEXT: bb01:
; X86-NEXT: [[A:%.*]] = alloca [[S:%.*]], align 8
; X86-NEXT: [[B:%.*]] = alloca [[S]], align 8
; X86-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !range [[RNG0:![0-9]+]], !noundef !1
; X86-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4, !range [[RNG0]], !noundef !1
; X86-NEXT: [[TMP0:%.*]] = load i32, ptr [[A]], align 4, !range [[RNG1:![0-9]+]], !noundef [[META2:![0-9]+]]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why the rename RNG0 -> RNG1?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

UTC (update_test_checks.py) did that, probably because we now pass --check-globals: some metadata is now visible to UTC and occurs earlier.

; X86-NEXT: [[TMP1:%.*]] = load i32, ptr [[B]], align 4, !range [[RNG1]], !noundef [[META2]]
; X86-NEXT: [[TMP2:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]]
; X86-NEXT: br i1 [[TMP2]], label %"bb1+bb2+bb3", label [[EXIT:%.*]]
; X86-NEXT: br i1 [[TMP2]], label %"bb1+bb2+bb3", label [[EXIT:%.*]], !prof [[PROF3:![0-9]+]]
; X86: "bb1+bb2+bb3":
; X86-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[S]], ptr [[A]], i64 0, i32 2
; X86-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[S]], ptr [[B]], i64 0, i32 2
Expand All @@ -80,23 +80,23 @@ bb0:
%value0 = load i32, ptr %a, align 4, !range !0, !noundef !1
%value1 = load i32, ptr %b, align 4, !range !0, !noundef !1
%cmp.i = icmp eq i32 %value0, %value1
br i1 %cmp.i, label %bb1, label %exit
br i1 %cmp.i, label %bb1, label %exit, !prof !3

bb1:
%second.i = getelementptr inbounds %S, ptr %a, i64 0, i32 2
%value2 = load i32, ptr %second.i, align 4
%second2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 2
%value3 = load i32, ptr %second2.i, align 4
%cmp2.i = icmp eq i32 %value2, %value3
br i1 %cmp2.i, label %bb2, label %exit
br i1 %cmp2.i, label %bb2, label %exit, !prof !4

bb2:
%third.i = getelementptr inbounds %S, ptr %a, i64 0, i32 3
%value4 = load i32, ptr %third.i, align 4
%third2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 3
%value5 = load i32, ptr %third2.i, align 4
%cmp3.i = icmp eq i32 %value4, %value5
br i1 %cmp3.i, label %bb3, label %exit
br i1 %cmp3.i, label %bb3, label %exit, !prof !5

bb3:
%fourth.i = getelementptr inbounds %S, ptr %a, i64 0, i32 4
Expand All @@ -113,3 +113,16 @@ exit:

!0 = !{i32 0, i32 2}
!1 = !{}
!2 = !{!"function_entry_count", i32 100}
!3 = !{!"branch_weights", i32 2, i32 3}
!4 = !{!"branch_weights", i32 5, i32 7}
!5 = !{!"branch_weights", i32 11, i32 13}
;.
; X86: attributes #[[ATTR0:[0-9]+]] = { nofree nosync }
; X86: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: read) }
;.
; X86: [[META0:![0-9]+]] = !{!"function_entry_count", i32 100}
; X86: [[RNG1]] = !{i32 0, i32 2}
; X86: [[META2]] = !{}
; X86: [[PROF3]] = !{!"branch_weights", i32 2, i32 3}
;.
27 changes: 20 additions & 7 deletions llvm/test/Transforms/MergeICmps/X86/entry-block-shuffled.ll
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals
; RUN: opt < %s -passes=mergeicmps -verify-dom-info -mtriple=x86_64-unknown-unknown -S | FileCheck %s

%S = type { i32, i32, i32, i32 }
Expand All @@ -15,11 +15,11 @@ define zeroext i1 @opeq1(
; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP0]], align 4
; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1]], align 4
; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[TMP2]], [[TMP3]]
; CHECK-NEXT: br i1 [[TMP4]], label %"land.rhs.i+land.rhs.i.2", label [[OPEQ1_EXIT:%.*]]
; CHECK-NEXT: br i1 [[TMP4]], label %"land.rhs.i+land.rhs.i.2", label [[OPEQ1_EXIT:%.*]], !prof [[PROF1:![0-9]+]]
; CHECK: "land.rhs.i+land.rhs.i.2":
; CHECK-NEXT: [[MEMCMP:%.*]] = call i32 @memcmp(ptr [[A]], ptr [[B]], i64 8)
; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[MEMCMP]], 0
; CHECK-NEXT: br i1 [[TMP5]], label [[LAND_RHS_I_31:%.*]], label [[OPEQ1_EXIT]]
; CHECK-NEXT: br i1 [[TMP5]], label [[LAND_RHS_I_31:%.*]], label [[OPEQ1_EXIT]], !prof [[PROF2:![0-9]+]]
; CHECK: land.rhs.i.31:
; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[S]], ptr [[A]], i64 0, i32 3
; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[S]], ptr [[B]], i64 0, i32 3
Expand All @@ -32,28 +32,28 @@ define zeroext i1 @opeq1(
; CHECK-NEXT: ret i1 [[TMP11]]
;
ptr nocapture readonly dereferenceable(16) %a,
ptr nocapture readonly dereferenceable(16) %b) local_unnamed_addr nofree nosync {
ptr nocapture readonly dereferenceable(16) %b) local_unnamed_addr nofree nosync !prof !0 {
entry:
%first.i = getelementptr inbounds %S, ptr %a, i64 0, i32 3
%0 = load i32, ptr %first.i, align 4
%first1.i = getelementptr inbounds %S, ptr %b, i64 0, i32 2
%1 = load i32, ptr %first1.i, align 4
%cmp.i = icmp eq i32 %0, %1
br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit
br i1 %cmp.i, label %land.rhs.i, label %opeq1.exit, !prof !1

land.rhs.i:
%2 = load i32, ptr %a, align 4
%3 = load i32, ptr %b, align 4
%cmp3.i = icmp eq i32 %2, %3
br i1 %cmp3.i, label %land.rhs.i.2, label %opeq1.exit
br i1 %cmp3.i, label %land.rhs.i.2, label %opeq1.exit, !prof !2

land.rhs.i.2:
%third.i = getelementptr inbounds %S, ptr %a, i64 0, i32 1
%4 = load i32, ptr %third.i, align 4
%third2.i = getelementptr inbounds %S, ptr %b, i64 0, i32 1
%5 = load i32, ptr %third2.i, align 4
%cmp4.i = icmp eq i32 %4, %5
br i1 %cmp4.i, label %land.rhs.i.3, label %opeq1.exit
br i1 %cmp4.i, label %land.rhs.i.3, label %opeq1.exit, !prof !3

land.rhs.i.3:
%fourth.i = getelementptr inbounds %S, ptr %a, i64 0, i32 3
Expand All @@ -67,3 +67,16 @@ opeq1.exit:
%8 = phi i1 [ false, %entry ], [ false, %land.rhs.i], [ false, %land.rhs.i.2 ], [ %cmp5.i, %land.rhs.i.3 ]
ret i1 %8
}

!0 = !{!"function_entry_count", i32 10}
!1 = !{!"branch_weights", i32 2, i32 3}
!2 = !{!"branch_weights", i32 5, i32 7}
!3 = !{!"branch_weights", i32 11, i32 13}
;.
; CHECK: attributes #[[ATTR0:[0-9]+]] = { nofree nosync }
; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: read) }
;.
; CHECK: [[META0:![0-9]+]] = !{!"function_entry_count", i32 10}
; CHECK: [[PROF1]] = !{!"branch_weights", i32 2, i32 3}
; CHECK: [[PROF2]] = !{!"branch_weights", i32 55, i32 233}
;.
Loading