Skip to content
Open
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 38 additions & 15 deletions llvm/lib/CodeGen/BranchFolding.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/BranchFoldingPass.h"
#include "llvm/CodeGen/MBFIWrapper.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBranchProbabilityInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
Expand Down Expand Up @@ -1553,32 +1554,54 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) {
MachineInstr &TailCall = *MBB->getFirstNonDebugInstr();
if (TII->isUnconditionalTailCall(TailCall)) {
SmallVector<MachineBasicBlock *> PredsChanged;
for (auto &Pred : MBB->predecessors()) {
for (auto *Pred : MBB->predecessors()) {
bool PredHasHotSuccessor = false;
for (MachineBasicBlock *const PredSucc : Pred->successors()) {
PredHasHotSuccessor |= MBPI.isEdgeHot(Pred, PredSucc);
}

MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
SmallVector<MachineOperand, 4> PredCond;
bool PredAnalyzable =
!TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true);

// Only eliminate if MBB == TBB (Taken Basic Block)
if (PredAnalyzable && !PredCond.empty() && PredTBB == MBB &&
PredTBB != PredFBB) {
// The predecessor has a conditional branch to this block which
// consists of only a tail call. Try to fold the tail call into the
// conditional branch.
bool IsEdgeCold = !MBPI.isEdgeHot(Pred, MBB);
bool CanFoldFallThrough =
PredHasHotSuccessor && IsEdgeCold &&
(MBB == PredFBB ||
(PredFBB == nullptr && Pred->getFallThrough() == MBB));
bool CanFoldTakenBlock =
(MBB == PredTBB && (PredHasHotSuccessor ? IsEdgeCold : true));

// When we have PGO (or equivalent) information, we want to fold the
// fallthrough if it's cold. Folding a fallthrough puts it behind a
// conditional branch which isn't desirable if it's hot. When there
// isn't any PGO information available we want to fold the taken block
// if it's possible and we never want to fold the fallthrough as we
// don't know if that is desirable.
if (PredAnalyzable && !PredCond.empty() && PredTBB != PredFBB &&
(CanFoldTakenBlock || CanFoldFallThrough)) {
SmallVector<MachineOperand, 4> ReversedCond(PredCond);
if (CanFoldFallThrough) {
DebugLoc Dl = MBB->findBranchDebugLoc();
TII->reverseBranchCondition(ReversedCond);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what happens to the profile (branch weights) in this case?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for looking at this @mtrofin.

My understanding is that the weights are still correct because we're eleminating one of the successors so the weights for that just get dropped.

I've updated my patch to ensure that if we get to the point where we reverse the branch we're sure that we're going to form a conditional tailcall.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But isn't the tail call still conditioned on ReversedCond? From what I read in e.g. the @true_likely test , you end up with:

testl %edi, %edi # encoding: [0x85,0xff]
je func_false # TAILCALL
<...fallthrough...>

So there's still a question of what's the probability you jump to func_false vs func_true (the fallthrough). What am I missing?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think I'm not understanding your concern, could you please clarify further?

With my patch applied we get the following probability out of branch-folder for the true_likely test:

bb.0 (%ir-block.1):
  successors: %bb.1(0x7fef9fcb); %bb.1(99.95%)
...

%bb.1 is the hot fallthrough successor.

The probabilities in the input MIR are:

...
successors: %bb.1(0x7fef9fcb), %bb.2(0x00106035); %bb.1(99.95%), %bb.2(0.05%)
...

This seems correct to me based on the following assumptions:

  • I don't think there's a way of expressing the probability of a tailcall edge.
  • The probability of reaching the successor appears to be correct.

TII->removeBranch(*Pred);
TII->insertBranch(*Pred, MBB, PredTBB, ReversedCond, Dl);
}

PredAnalyzable =
!TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true);

if (TII->canMakeTailCallConditional(PredCond, TailCall)) {
// TODO: It would be nice if analyzeBranch() could provide a pointer
// to the branch instruction so replaceBranchWithTailCall() doesn't
// have to search for it.
// TODO: It would be nice if analyzeBranch() could provide a
// pointer to the branch instruction so
// replaceBranchWithTailCall() doesn't have to search for it.
TII->replaceBranchWithTailCall(*Pred, PredCond, TailCall);
PredsChanged.push_back(Pred);
}
}
// If the predecessor is falling through to this block, we could reverse
// the branch condition and fold the tail call into that. However, after
// that we might have to re-arrange the CFG to fall through to the other
// block and there is a high risk of regressing code size rather than
// improving it.
}

if (!PredsChanged.empty()) {
NumTailCalls += PredsChanged.size();
for (auto &Pred : PredsChanged)
Expand Down
101 changes: 101 additions & 0 deletions llvm/test/CodeGen/X86/conditional-tailcall.ll
Original file line number Diff line number Diff line change
Expand Up @@ -596,3 +596,104 @@ cleanup.thread: ; preds = %cleanup.thread.loop
%6 = phi i1 [ %cmp37, %5 ], [ %call34, %if.else28 ], [ false, %cleanup.thread.loopexit ]
ret i1 %6
}

define void @true_likely(i1 noundef zeroext %0) {
; CHECK32-LABEL: true_likely:
; CHECK32: # %bb.0:
; CHECK32-NEXT: cmpb $0, {{[0-9]+}}(%esp) # encoding: [0x80,0x7c,0x24,0x04,0x00]
; CHECK32-NEXT: je func_false # TAILCALL
; CHECK32-NEXT: # encoding: [0x74,A]
; CHECK32-NEXT: # fixup A - offset: 1, value: func_false, kind: FK_PCRel_1
; CHECK32-NEXT: # %bb.1:
; CHECK32-NEXT: jmp func_true # TAILCALL
; CHECK32-NEXT: # encoding: [0xeb,A]
; CHECK32-NEXT: # fixup A - offset: 1, value: func_true, kind: FK_PCRel_1
;
; CHECK64-LABEL: true_likely:
; CHECK64: # %bb.0:
; CHECK64-NEXT: testl %edi, %edi # encoding: [0x85,0xff]
; CHECK64-NEXT: je func_false # TAILCALL
; CHECK64-NEXT: # encoding: [0x74,A]
; CHECK64-NEXT: # fixup A - offset: 1, value: func_false, kind: FK_PCRel_1
; CHECK64-NEXT: # %bb.1:
; CHECK64-NEXT: jmp func_true # TAILCALL
; CHECK64-NEXT: # encoding: [0xeb,A]
; CHECK64-NEXT: # fixup A - offset: 1, value: func_true, kind: FK_PCRel_1
;
; WIN64-LABEL: true_likely:
; WIN64: # %bb.0:
; WIN64-NEXT: testb %cl, %cl # encoding: [0x84,0xc9]
; WIN64-NEXT: je func_false # TAILCALL
; WIN64-NEXT: # encoding: [0x74,A]
; WIN64-NEXT: # fixup A - offset: 1, value: func_false, kind: FK_PCRel_1
; WIN64-NEXT: # %bb.1:
; WIN64-NEXT: jmp func_true # TAILCALL
; WIN64-NEXT: # encoding: [0xeb,A]
; WIN64-NEXT: # fixup A - offset: 1, value: func_true, kind: FK_PCRel_1
br i1 %0, label %2, label %3, !prof !6

2:
tail call void @func_true()
br label %4

3:
tail call void @func_false()
br label %4

4:
ret void
}

define void @false_likely(i1 noundef zeroext %0) {
; CHECK32-LABEL: false_likely:
; CHECK32: # %bb.0:
; CHECK32-NEXT: cmpb $0, {{[0-9]+}}(%esp) # encoding: [0x80,0x7c,0x24,0x04,0x00]
; CHECK32-NEXT: jne func_true # TAILCALL
; CHECK32-NEXT: # encoding: [0x75,A]
; CHECK32-NEXT: # fixup A - offset: 1, value: func_true, kind: FK_PCRel_1
; CHECK32-NEXT: # %bb.1:
; CHECK32-NEXT: jmp func_false # TAILCALL
; CHECK32-NEXT: # encoding: [0xeb,A]
; CHECK32-NEXT: # fixup A - offset: 1, value: func_false, kind: FK_PCRel_1
;
; CHECK64-LABEL: false_likely:
; CHECK64: # %bb.0:
; CHECK64-NEXT: testl %edi, %edi # encoding: [0x85,0xff]
; CHECK64-NEXT: jne func_true # TAILCALL
; CHECK64-NEXT: # encoding: [0x75,A]
; CHECK64-NEXT: # fixup A - offset: 1, value: func_true, kind: FK_PCRel_1
; CHECK64-NEXT: # %bb.1:
; CHECK64-NEXT: jmp func_false # TAILCALL
; CHECK64-NEXT: # encoding: [0xeb,A]
; CHECK64-NEXT: # fixup A - offset: 1, value: func_false, kind: FK_PCRel_1
;
; WIN64-LABEL: false_likely:
; WIN64: # %bb.0:
; WIN64-NEXT: testb %cl, %cl # encoding: [0x84,0xc9]
; WIN64-NEXT: jne func_true # TAILCALL
; WIN64-NEXT: # encoding: [0x75,A]
; WIN64-NEXT: # fixup A - offset: 1, value: func_true, kind: FK_PCRel_1
; WIN64-NEXT: # %bb.1:
; WIN64-NEXT: jmp func_false # TAILCALL
; WIN64-NEXT: # encoding: [0xeb,A]
; WIN64-NEXT: # fixup A - offset: 1, value: func_false, kind: FK_PCRel_1
br i1 %0, label %2, label %3, !prof !7

2:
tail call void @func_true()
br label %4

3:
tail call void @func_false()
br label %4

4:
ret void
}

!6 = !{!"branch_weights", !"expected", i32 2000, i32 1}
!7 = !{!"branch_weights", !"expected", i32 1, i32 2000}


declare dso_local void @func_true()
declare dso_local void @func_false()
60 changes: 40 additions & 20 deletions llvm/test/CodeGen/X86/segmented-stacks.ll
Original file line number Diff line number Diff line change
Expand Up @@ -1732,8 +1732,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 {
; X86-Linux-LABEL: test_sibling_call_empty_frame:
; X86-Linux: # %bb.0:
; X86-Linux-NEXT: cmpl %gs:48, %esp
; X86-Linux-NEXT: ja callee@PLT # TAILCALL
; X86-Linux-NEXT: # %bb.1:
; X86-Linux-NEXT: jbe .LBB8_1
; X86-Linux-NEXT: # %bb.2:
; X86-Linux-NEXT: jmp callee@PLT # TAILCALL
; X86-Linux-NEXT: .LBB8_1:
; X86-Linux-NEXT: pushl $4
; X86-Linux-NEXT: pushl $0
; X86-Linux-NEXT: calll __morestack
Expand All @@ -1743,8 +1745,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 {
; X64-Linux-LABEL: test_sibling_call_empty_frame:
; X64-Linux: # %bb.0:
; X64-Linux-NEXT: cmpq %fs:112, %rsp
; X64-Linux-NEXT: ja callee@PLT # TAILCALL
; X64-Linux-NEXT: # %bb.1:
; X64-Linux-NEXT: jbe .LBB8_1
; X64-Linux-NEXT: # %bb.2:
; X64-Linux-NEXT: jmp callee@PLT # TAILCALL
; X64-Linux-NEXT: .LBB8_1:
; X64-Linux-NEXT: movl $0, %r10d
; X64-Linux-NEXT: movl $0, %r11d
; X64-Linux-NEXT: callq __morestack
Expand All @@ -1769,8 +1773,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 {
; X32ABI-LABEL: test_sibling_call_empty_frame:
; X32ABI: # %bb.0:
; X32ABI-NEXT: cmpl %fs:64, %esp
; X32ABI-NEXT: ja callee@PLT # TAILCALL
; X32ABI-NEXT: # %bb.1:
; X32ABI-NEXT: jbe .LBB8_1
; X32ABI-NEXT: # %bb.2:
; X32ABI-NEXT: jmp callee@PLT # TAILCALL
; X32ABI-NEXT: .LBB8_1:
; X32ABI-NEXT: movl $0, %r10d
; X32ABI-NEXT: movl $0, %r11d
; X32ABI-NEXT: callq __morestack
Expand All @@ -1781,8 +1787,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 {
; X86-Darwin: ## %bb.0:
; X86-Darwin-NEXT: movl $432, %ecx ## imm = 0x1B0
; X86-Darwin-NEXT: cmpl %gs:(%ecx), %esp
; X86-Darwin-NEXT: ja _callee ## TAILCALL
; X86-Darwin-NEXT: ## %bb.1:
; X86-Darwin-NEXT: jbe LBB8_1
; X86-Darwin-NEXT: ## %bb.2:
; X86-Darwin-NEXT: jmp _callee ## TAILCALL
; X86-Darwin-NEXT: LBB8_1:
; X86-Darwin-NEXT: pushl $4
; X86-Darwin-NEXT: pushl $0
; X86-Darwin-NEXT: calll ___morestack
Expand All @@ -1792,8 +1800,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 {
; X64-Darwin-LABEL: test_sibling_call_empty_frame:
; X64-Darwin: ## %bb.0:
; X64-Darwin-NEXT: cmpq %gs:816, %rsp
; X64-Darwin-NEXT: ja _callee ## TAILCALL
; X64-Darwin-NEXT: ## %bb.1:
; X64-Darwin-NEXT: jbe LBB8_1
; X64-Darwin-NEXT: ## %bb.2:
; X64-Darwin-NEXT: jmp _callee ## TAILCALL
; X64-Darwin-NEXT: LBB8_1:
; X64-Darwin-NEXT: movl $0, %r10d
; X64-Darwin-NEXT: movl $0, %r11d
; X64-Darwin-NEXT: callq ___morestack
Expand All @@ -1803,8 +1813,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 {
; X86-MinGW-LABEL: test_sibling_call_empty_frame:
; X86-MinGW: # %bb.0:
; X86-MinGW-NEXT: cmpl %fs:20, %esp
; X86-MinGW-NEXT: ja _callee # TAILCALL
; X86-MinGW-NEXT: # %bb.1:
; X86-MinGW-NEXT: jbe LBB8_1
; X86-MinGW-NEXT: # %bb.2:
; X86-MinGW-NEXT: jmp _callee # TAILCALL
; X86-MinGW-NEXT: LBB8_1:
; X86-MinGW-NEXT: pushl $4
; X86-MinGW-NEXT: pushl $0
; X86-MinGW-NEXT: calll ___morestack
Expand All @@ -1814,8 +1826,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 {
; X64-FreeBSD-LABEL: test_sibling_call_empty_frame:
; X64-FreeBSD: # %bb.0:
; X64-FreeBSD-NEXT: cmpq %fs:24, %rsp
; X64-FreeBSD-NEXT: ja callee@PLT # TAILCALL
; X64-FreeBSD-NEXT: # %bb.1:
; X64-FreeBSD-NEXT: jbe .LBB8_1
; X64-FreeBSD-NEXT: # %bb.2:
; X64-FreeBSD-NEXT: jmp callee@PLT # TAILCALL
; X64-FreeBSD-NEXT: .LBB8_1:
; X64-FreeBSD-NEXT: movl $0, %r10d
; X64-FreeBSD-NEXT: movl $0, %r11d
; X64-FreeBSD-NEXT: callq __morestack
Expand All @@ -1825,8 +1839,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 {
; X86-DFlyBSD-LABEL: test_sibling_call_empty_frame:
; X86-DFlyBSD: # %bb.0:
; X86-DFlyBSD-NEXT: cmpl %fs:16, %esp
; X86-DFlyBSD-NEXT: ja callee@PLT # TAILCALL
; X86-DFlyBSD-NEXT: # %bb.1:
; X86-DFlyBSD-NEXT: jbe .LBB8_1
; X86-DFlyBSD-NEXT: # %bb.2:
; X86-DFlyBSD-NEXT: jmp callee@PLT # TAILCALL
; X86-DFlyBSD-NEXT: .LBB8_1:
; X86-DFlyBSD-NEXT: pushl $4
; X86-DFlyBSD-NEXT: pushl $0
; X86-DFlyBSD-NEXT: calll __morestack
Expand All @@ -1836,8 +1852,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 {
; X64-DFlyBSD-LABEL: test_sibling_call_empty_frame:
; X64-DFlyBSD: # %bb.0:
; X64-DFlyBSD-NEXT: cmpq %fs:32, %rsp
; X64-DFlyBSD-NEXT: ja callee@PLT # TAILCALL
; X64-DFlyBSD-NEXT: # %bb.1:
; X64-DFlyBSD-NEXT: jbe .LBB8_1
; X64-DFlyBSD-NEXT: # %bb.2:
; X64-DFlyBSD-NEXT: jmp callee@PLT # TAILCALL
; X64-DFlyBSD-NEXT: .LBB8_1:
; X64-DFlyBSD-NEXT: movl $0, %r10d
; X64-DFlyBSD-NEXT: movl $0, %r11d
; X64-DFlyBSD-NEXT: callq __morestack
Expand All @@ -1847,8 +1865,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 {
; X64-MinGW-LABEL: test_sibling_call_empty_frame:
; X64-MinGW: # %bb.0:
; X64-MinGW-NEXT: cmpq %gs:40, %rsp
; X64-MinGW-NEXT: ja callee # TAILCALL
; X64-MinGW-NEXT: # %bb.1:
; X64-MinGW-NEXT: jbe .LBB8_1
; X64-MinGW-NEXT: # %bb.2:
; X64-MinGW-NEXT: jmp callee # TAILCALL
; X64-MinGW-NEXT: .LBB8_1:
; X64-MinGW-NEXT: movl $0, %r10d
; X64-MinGW-NEXT: movl $32, %r11d
; X64-MinGW-NEXT: callq __morestack
Expand Down
8 changes: 6 additions & 2 deletions llvm/test/CodeGen/X86/switch-bt.ll
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,9 @@ define void @test2(i32 %x) nounwind ssp {
; CHECK-NEXT: # %bb.1: # %entry
; CHECK-NEXT: movl $91, %eax
; CHECK-NEXT: btl %edi, %eax
; CHECK-NEXT: jb bar@PLT # TAILCALL
; CHECK-NEXT: jae .LBB1_2
; CHECK-NEXT: # %bb.3: # %if.then
; CHECK-NEXT: jmp bar@PLT # TAILCALL
; CHECK-NEXT: .LBB1_2: # %if.end
; CHECK-NEXT: retq

Expand Down Expand Up @@ -116,7 +118,9 @@ define void @test3(i32 %x) nounwind {
; CHECK-NEXT: ja .LBB2_2
; CHECK-NEXT: # %bb.1:
; CHECK-NEXT: cmpl $4, %edi
; CHECK-NEXT: jne bar@PLT # TAILCALL
; CHECK-NEXT: je .LBB2_2
; CHECK-NEXT: # %bb.3: # %if.then
; CHECK-NEXT: jmp bar@PLT # TAILCALL
; CHECK-NEXT: .LBB2_2: # %if.end
; CHECK-NEXT: retq
switch i32 %x, label %if.end [
Expand Down