diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp index 7292bc2be0df2..0ecb90b0e6a41 100644 --- a/llvm/lib/CodeGen/BranchFolding.cpp +++ b/llvm/lib/CodeGen/BranchFolding.cpp @@ -26,6 +26,7 @@ #include "llvm/CodeGen/Analysis.h" #include "llvm/CodeGen/BranchFoldingPass.h" #include "llvm/CodeGen/MBFIWrapper.h" +#include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineBlockFrequencyInfo.h" #include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunction.h" @@ -1553,32 +1554,61 @@ bool BranchFolder::OptimizeBlock(MachineBasicBlock *MBB) { MachineInstr &TailCall = *MBB->getFirstNonDebugInstr(); if (TII->isUnconditionalTailCall(TailCall)) { SmallVector PredsChanged; - for (auto &Pred : MBB->predecessors()) { + for (auto *Pred : MBB->predecessors()) { + bool PredHasHotSuccessor = false; + for (MachineBasicBlock *const PredSucc : Pred->successors()) { + PredHasHotSuccessor |= MBPI.isEdgeHot(Pred, PredSucc); + } + MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr; SmallVector PredCond; bool PredAnalyzable = !TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true); - // Only eliminate if MBB == TBB (Taken Basic Block) - if (PredAnalyzable && !PredCond.empty() && PredTBB == MBB && - PredTBB != PredFBB) { - // The predecessor has a conditional branch to this block which - // consists of only a tail call. Try to fold the tail call into the - // conditional branch. + bool IsEdgeCold = !MBPI.isEdgeHot(Pred, MBB); + bool CanFoldFallThrough = + PredHasHotSuccessor && IsEdgeCold && + (MBB == PredFBB || + (PredFBB == nullptr && Pred->getFallThrough() == MBB)); + bool CanFoldTakenBlock = + (MBB == PredTBB && (PredHasHotSuccessor ? IsEdgeCold : true)); + + // When we have PGO (or equivalent) information, we want to fold the + // fallthrough if it's cold. + // Alternatively if the taken block is cold we want to fold that. + // Folding a block puts it behind a conditional branch which isn't + // desirable if it's hot. + // + // When there isn't any PGO information available we want to fold the + // taken block if it's possible and we never want to fold the + // fallthrough as we don't know if that is desirable. + if (PredAnalyzable && !PredCond.empty() && PredTBB != PredFBB && + (CanFoldTakenBlock || CanFoldFallThrough)) { + if (CanFoldFallThrough) { + SmallVector ReversedCond(PredCond); + DebugLoc Dl = MBB->findBranchDebugLoc(); + TII->reverseBranchCondition(ReversedCond); + + if (!TII->canMakeTailCallConditional(ReversedCond, TailCall)) + continue; + + TII->removeBranch(*Pred); + TII->insertBranch(*Pred, MBB, PredTBB, ReversedCond, Dl); + } + + PredAnalyzable = + !TII->analyzeBranch(*Pred, PredTBB, PredFBB, PredCond, true); + if (TII->canMakeTailCallConditional(PredCond, TailCall)) { - // TODO: It would be nice if analyzeBranch() could provide a pointer - // to the branch instruction so replaceBranchWithTailCall() doesn't - // have to search for it. + // TODO: It would be nice if analyzeBranch() could provide a + // pointer to the branch instruction so + // replaceBranchWithTailCall() doesn't have to search for it. TII->replaceBranchWithTailCall(*Pred, PredCond, TailCall); PredsChanged.push_back(Pred); } } - // If the predecessor is falling through to this block, we could reverse - // the branch condition and fold the tail call into that. However, after - // that we might have to re-arrange the CFG to fall through to the other - // block and there is a high risk of regressing code size rather than - // improving it. } + if (!PredsChanged.empty()) { NumTailCalls += PredsChanged.size(); for (auto &Pred : PredsChanged) diff --git a/llvm/test/CodeGen/X86/conditional-tailcall.ll b/llvm/test/CodeGen/X86/conditional-tailcall.ll index 2859a87db3d56..7233dfd44c605 100644 --- a/llvm/test/CodeGen/X86/conditional-tailcall.ll +++ b/llvm/test/CodeGen/X86/conditional-tailcall.ll @@ -596,3 +596,104 @@ cleanup.thread: ; preds = %cleanup.thread.loop %6 = phi i1 [ %cmp37, %5 ], [ %call34, %if.else28 ], [ false, %cleanup.thread.loopexit ] ret i1 %6 } + +define void @true_likely(i1 noundef zeroext %0) { +; CHECK32-LABEL: true_likely: +; CHECK32: # %bb.0: +; CHECK32-NEXT: cmpb $0, {{[0-9]+}}(%esp) # encoding: [0x80,0x7c,0x24,0x04,0x00] +; CHECK32-NEXT: je func_false # TAILCALL +; CHECK32-NEXT: # encoding: [0x74,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: func_false, kind: FK_PCRel_1 +; CHECK32-NEXT: # %bb.1: +; CHECK32-NEXT: jmp func_true # TAILCALL +; CHECK32-NEXT: # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: func_true, kind: FK_PCRel_1 +; +; CHECK64-LABEL: true_likely: +; CHECK64: # %bb.0: +; CHECK64-NEXT: testl %edi, %edi # encoding: [0x85,0xff] +; CHECK64-NEXT: je func_false # TAILCALL +; CHECK64-NEXT: # encoding: [0x74,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: func_false, kind: FK_PCRel_1 +; CHECK64-NEXT: # %bb.1: +; CHECK64-NEXT: jmp func_true # TAILCALL +; CHECK64-NEXT: # encoding: [0xeb,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: func_true, kind: FK_PCRel_1 +; +; WIN64-LABEL: true_likely: +; WIN64: # %bb.0: +; WIN64-NEXT: testb %cl, %cl # encoding: [0x84,0xc9] +; WIN64-NEXT: je func_false # TAILCALL +; WIN64-NEXT: # encoding: [0x74,A] +; WIN64-NEXT: # fixup A - offset: 1, value: func_false, kind: FK_PCRel_1 +; WIN64-NEXT: # %bb.1: +; WIN64-NEXT: jmp func_true # TAILCALL +; WIN64-NEXT: # encoding: [0xeb,A] +; WIN64-NEXT: # fixup A - offset: 1, value: func_true, kind: FK_PCRel_1 + br i1 %0, label %2, label %3, !prof !6 + +2: + tail call void @func_true() + br label %4 + +3: + tail call void @func_false() + br label %4 + +4: + ret void +} + +define void @false_likely(i1 noundef zeroext %0) { +; CHECK32-LABEL: false_likely: +; CHECK32: # %bb.0: +; CHECK32-NEXT: cmpb $0, {{[0-9]+}}(%esp) # encoding: [0x80,0x7c,0x24,0x04,0x00] +; CHECK32-NEXT: jne func_true # TAILCALL +; CHECK32-NEXT: # encoding: [0x75,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: func_true, kind: FK_PCRel_1 +; CHECK32-NEXT: # %bb.1: +; CHECK32-NEXT: jmp func_false # TAILCALL +; CHECK32-NEXT: # encoding: [0xeb,A] +; CHECK32-NEXT: # fixup A - offset: 1, value: func_false, kind: FK_PCRel_1 +; +; CHECK64-LABEL: false_likely: +; CHECK64: # %bb.0: +; CHECK64-NEXT: testl %edi, %edi # encoding: [0x85,0xff] +; CHECK64-NEXT: jne func_true # TAILCALL +; CHECK64-NEXT: # encoding: [0x75,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: func_true, kind: FK_PCRel_1 +; CHECK64-NEXT: # %bb.1: +; CHECK64-NEXT: jmp func_false # TAILCALL +; CHECK64-NEXT: # encoding: [0xeb,A] +; CHECK64-NEXT: # fixup A - offset: 1, value: func_false, kind: FK_PCRel_1 +; +; WIN64-LABEL: false_likely: +; WIN64: # %bb.0: +; WIN64-NEXT: testb %cl, %cl # encoding: [0x84,0xc9] +; WIN64-NEXT: jne func_true # TAILCALL +; WIN64-NEXT: # encoding: [0x75,A] +; WIN64-NEXT: # fixup A - offset: 1, value: func_true, kind: FK_PCRel_1 +; WIN64-NEXT: # %bb.1: +; WIN64-NEXT: jmp func_false # TAILCALL +; WIN64-NEXT: # encoding: [0xeb,A] +; WIN64-NEXT: # fixup A - offset: 1, value: func_false, kind: FK_PCRel_1 + br i1 %0, label %2, label %3, !prof !7 + +2: + tail call void @func_true() + br label %4 + +3: + tail call void @func_false() + br label %4 + +4: + ret void +} + +!6 = !{!"branch_weights", !"expected", i32 2000, i32 1} +!7 = !{!"branch_weights", !"expected", i32 1, i32 2000} + + +declare dso_local void @func_true() +declare dso_local void @func_false() diff --git a/llvm/test/CodeGen/X86/segmented-stacks.ll b/llvm/test/CodeGen/X86/segmented-stacks.ll index f8627ff56a1f9..cd406385d6d68 100644 --- a/llvm/test/CodeGen/X86/segmented-stacks.ll +++ b/llvm/test/CodeGen/X86/segmented-stacks.ll @@ -1732,8 +1732,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 { ; X86-Linux-LABEL: test_sibling_call_empty_frame: ; X86-Linux: # %bb.0: ; X86-Linux-NEXT: cmpl %gs:48, %esp -; X86-Linux-NEXT: ja callee@PLT # TAILCALL -; X86-Linux-NEXT: # %bb.1: +; X86-Linux-NEXT: jbe .LBB8_1 +; X86-Linux-NEXT: # %bb.2: +; X86-Linux-NEXT: jmp callee@PLT # TAILCALL +; X86-Linux-NEXT: .LBB8_1: ; X86-Linux-NEXT: pushl $4 ; X86-Linux-NEXT: pushl $0 ; X86-Linux-NEXT: calll __morestack @@ -1743,8 +1745,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 { ; X64-Linux-LABEL: test_sibling_call_empty_frame: ; X64-Linux: # %bb.0: ; X64-Linux-NEXT: cmpq %fs:112, %rsp -; X64-Linux-NEXT: ja callee@PLT # TAILCALL -; X64-Linux-NEXT: # %bb.1: +; X64-Linux-NEXT: jbe .LBB8_1 +; X64-Linux-NEXT: # %bb.2: +; X64-Linux-NEXT: jmp callee@PLT # TAILCALL +; X64-Linux-NEXT: .LBB8_1: ; X64-Linux-NEXT: movl $0, %r10d ; X64-Linux-NEXT: movl $0, %r11d ; X64-Linux-NEXT: callq __morestack @@ -1769,8 +1773,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 { ; X32ABI-LABEL: test_sibling_call_empty_frame: ; X32ABI: # %bb.0: ; X32ABI-NEXT: cmpl %fs:64, %esp -; X32ABI-NEXT: ja callee@PLT # TAILCALL -; X32ABI-NEXT: # %bb.1: +; X32ABI-NEXT: jbe .LBB8_1 +; X32ABI-NEXT: # %bb.2: +; X32ABI-NEXT: jmp callee@PLT # TAILCALL +; X32ABI-NEXT: .LBB8_1: ; X32ABI-NEXT: movl $0, %r10d ; X32ABI-NEXT: movl $0, %r11d ; X32ABI-NEXT: callq __morestack @@ -1781,8 +1787,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 { ; X86-Darwin: ## %bb.0: ; X86-Darwin-NEXT: movl $432, %ecx ## imm = 0x1B0 ; X86-Darwin-NEXT: cmpl %gs:(%ecx), %esp -; X86-Darwin-NEXT: ja _callee ## TAILCALL -; X86-Darwin-NEXT: ## %bb.1: +; X86-Darwin-NEXT: jbe LBB8_1 +; X86-Darwin-NEXT: ## %bb.2: +; X86-Darwin-NEXT: jmp _callee ## TAILCALL +; X86-Darwin-NEXT: LBB8_1: ; X86-Darwin-NEXT: pushl $4 ; X86-Darwin-NEXT: pushl $0 ; X86-Darwin-NEXT: calll ___morestack @@ -1792,8 +1800,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 { ; X64-Darwin-LABEL: test_sibling_call_empty_frame: ; X64-Darwin: ## %bb.0: ; X64-Darwin-NEXT: cmpq %gs:816, %rsp -; X64-Darwin-NEXT: ja _callee ## TAILCALL -; X64-Darwin-NEXT: ## %bb.1: +; X64-Darwin-NEXT: jbe LBB8_1 +; X64-Darwin-NEXT: ## %bb.2: +; X64-Darwin-NEXT: jmp _callee ## TAILCALL +; X64-Darwin-NEXT: LBB8_1: ; X64-Darwin-NEXT: movl $0, %r10d ; X64-Darwin-NEXT: movl $0, %r11d ; X64-Darwin-NEXT: callq ___morestack @@ -1803,8 +1813,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 { ; X86-MinGW-LABEL: test_sibling_call_empty_frame: ; X86-MinGW: # %bb.0: ; X86-MinGW-NEXT: cmpl %fs:20, %esp -; X86-MinGW-NEXT: ja _callee # TAILCALL -; X86-MinGW-NEXT: # %bb.1: +; X86-MinGW-NEXT: jbe LBB8_1 +; X86-MinGW-NEXT: # %bb.2: +; X86-MinGW-NEXT: jmp _callee # TAILCALL +; X86-MinGW-NEXT: LBB8_1: ; X86-MinGW-NEXT: pushl $4 ; X86-MinGW-NEXT: pushl $0 ; X86-MinGW-NEXT: calll ___morestack @@ -1814,8 +1826,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 { ; X64-FreeBSD-LABEL: test_sibling_call_empty_frame: ; X64-FreeBSD: # %bb.0: ; X64-FreeBSD-NEXT: cmpq %fs:24, %rsp -; X64-FreeBSD-NEXT: ja callee@PLT # TAILCALL -; X64-FreeBSD-NEXT: # %bb.1: +; X64-FreeBSD-NEXT: jbe .LBB8_1 +; X64-FreeBSD-NEXT: # %bb.2: +; X64-FreeBSD-NEXT: jmp callee@PLT # TAILCALL +; X64-FreeBSD-NEXT: .LBB8_1: ; X64-FreeBSD-NEXT: movl $0, %r10d ; X64-FreeBSD-NEXT: movl $0, %r11d ; X64-FreeBSD-NEXT: callq __morestack @@ -1825,8 +1839,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 { ; X86-DFlyBSD-LABEL: test_sibling_call_empty_frame: ; X86-DFlyBSD: # %bb.0: ; X86-DFlyBSD-NEXT: cmpl %fs:16, %esp -; X86-DFlyBSD-NEXT: ja callee@PLT # TAILCALL -; X86-DFlyBSD-NEXT: # %bb.1: +; X86-DFlyBSD-NEXT: jbe .LBB8_1 +; X86-DFlyBSD-NEXT: # %bb.2: +; X86-DFlyBSD-NEXT: jmp callee@PLT # TAILCALL +; X86-DFlyBSD-NEXT: .LBB8_1: ; X86-DFlyBSD-NEXT: pushl $4 ; X86-DFlyBSD-NEXT: pushl $0 ; X86-DFlyBSD-NEXT: calll __morestack @@ -1836,8 +1852,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 { ; X64-DFlyBSD-LABEL: test_sibling_call_empty_frame: ; X64-DFlyBSD: # %bb.0: ; X64-DFlyBSD-NEXT: cmpq %fs:32, %rsp -; X64-DFlyBSD-NEXT: ja callee@PLT # TAILCALL -; X64-DFlyBSD-NEXT: # %bb.1: +; X64-DFlyBSD-NEXT: jbe .LBB8_1 +; X64-DFlyBSD-NEXT: # %bb.2: +; X64-DFlyBSD-NEXT: jmp callee@PLT # TAILCALL +; X64-DFlyBSD-NEXT: .LBB8_1: ; X64-DFlyBSD-NEXT: movl $0, %r10d ; X64-DFlyBSD-NEXT: movl $0, %r11d ; X64-DFlyBSD-NEXT: callq __morestack @@ -1847,8 +1865,10 @@ define i32 @test_sibling_call_empty_frame(i32 %x) #0 { ; X64-MinGW-LABEL: test_sibling_call_empty_frame: ; X64-MinGW: # %bb.0: ; X64-MinGW-NEXT: cmpq %gs:40, %rsp -; X64-MinGW-NEXT: ja callee # TAILCALL -; X64-MinGW-NEXT: # %bb.1: +; X64-MinGW-NEXT: jbe .LBB8_1 +; X64-MinGW-NEXT: # %bb.2: +; X64-MinGW-NEXT: jmp callee # TAILCALL +; X64-MinGW-NEXT: .LBB8_1: ; X64-MinGW-NEXT: movl $0, %r10d ; X64-MinGW-NEXT: movl $32, %r11d ; X64-MinGW-NEXT: callq __morestack diff --git a/llvm/test/CodeGen/X86/switch-bt.ll b/llvm/test/CodeGen/X86/switch-bt.ll index 2bf7c46e67e18..b1ec2d59a6a76 100644 --- a/llvm/test/CodeGen/X86/switch-bt.ll +++ b/llvm/test/CodeGen/X86/switch-bt.ll @@ -86,7 +86,9 @@ define void @test2(i32 %x) nounwind ssp { ; CHECK-NEXT: # %bb.1: # %entry ; CHECK-NEXT: movl $91, %eax ; CHECK-NEXT: btl %edi, %eax -; CHECK-NEXT: jb bar@PLT # TAILCALL +; CHECK-NEXT: jae .LBB1_2 +; CHECK-NEXT: # %bb.3: # %if.then +; CHECK-NEXT: jmp bar@PLT # TAILCALL ; CHECK-NEXT: .LBB1_2: # %if.end ; CHECK-NEXT: retq @@ -116,7 +118,9 @@ define void @test3(i32 %x) nounwind { ; CHECK-NEXT: ja .LBB2_2 ; CHECK-NEXT: # %bb.1: ; CHECK-NEXT: cmpl $4, %edi -; CHECK-NEXT: jne bar@PLT # TAILCALL +; CHECK-NEXT: je .LBB2_2 +; CHECK-NEXT: # %bb.3: # %if.then +; CHECK-NEXT: jmp bar@PLT # TAILCALL ; CHECK-NEXT: .LBB2_2: # %if.end ; CHECK-NEXT: retq switch i32 %x, label %if.end [