diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp index 817e88d8a0bc0..aa2027a13da2a 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp @@ -36,11 +36,28 @@ void X86InstrPostProcess::setMemBarriers(std::unique_ptr &Inst, } } +void X86InstrPostProcess::useStackEngine(std::unique_ptr &Inst, + const MCInst &MCI) { + // TODO(boomanaiden154): We currently do not handle PUSHF/POPF because we + // have not done the necessary benchmarking to see if they are also + // optimized by the stack engine. + if (X86::isPOP(MCI.getOpcode()) || X86::isPUSH(MCI.getOpcode())) { + auto *StackRegisterDef = + llvm::find_if(Inst->getDefs(), [](const WriteState &State) { + return State.getRegisterID() == X86::RSP; + }); + assert( + StackRegisterDef != Inst->getDefs().end() && + "Expected push/pop instruction to implicitly write the RSP register."); + Inst->getDefs().erase(StackRegisterDef); + } +} + void X86InstrPostProcess::postProcessInstruction( std::unique_ptr &Inst, const MCInst &MCI) { - // Currently, we only modify certain instructions' IsALoadBarrier and - // IsAStoreBarrier flags. + // Set IsALoadBarrier and IsAStoreBarrier flags. setMemBarriers(Inst, MCI); + useStackEngine(Inst, MCI); } } // namespace mca diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h index 4a83ba848dd88..c5459e42dfc9f 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h @@ -28,6 +28,11 @@ class X86InstrPostProcess : public InstrPostProcess { /// as load and store barriers. void setMemBarriers(std::unique_ptr &Inst, const MCInst &MCI); + /// Called within X86InstrPostProcess to remove the rsp write operand (def) + /// on stack instructions to better simulate the stack engine. We currently + /// do not model features of the stack engine like sync uops.
+ void useStackEngine(std::unique_ptr &Inst, const MCInst &MCI); + public: X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} diff --git a/llvm/test/tools/llvm-mca/X86/stack-engine-pop.s b/llvm/test/tools/llvm-mca/X86/stack-engine-pop.s new file mode 100644 index 0000000000000..2ffb52ae61fc4 --- /dev/null +++ b/llvm/test/tools/llvm-mca/X86/stack-engine-pop.s @@ -0,0 +1,92 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -timeline -iterations=2 < %s | FileCheck %s + +movq $0x80, %rsp +popq %rax +popq %rcx +popq %rdx +popq %rbx +popq %r12 + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 12 +# CHECK-NEXT: Total Cycles: 14 +# CHECK-NEXT: Total uOps: 22 + +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 1.57 +# CHECK-NEXT: IPC: 0.86 +# CHECK-NEXT: Block RThroughput: 2.5 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.25 movq $128, %rsp +# CHECK-NEXT: 2 6 0.50 * popq %rax +# CHECK-NEXT: 2 6 0.50 * popq %rcx +# CHECK-NEXT: 2 6 0.50 * popq %rdx +# CHECK-NEXT: 2 6 0.50 * popq %rbx +# CHECK-NEXT: 2 6 0.50 * popq %r12 + +# CHECK: Resources: +# CHECK-NEXT: [0] - SKLDivider +# CHECK-NEXT: [1] - SKLFPDivider +# CHECK-NEXT: [2] - SKLPort0 +# CHECK-NEXT: [3] - SKLPort1 +# CHECK-NEXT: [4] - SKLPort2 +# CHECK-NEXT: [5] - SKLPort3 +# CHECK-NEXT: [6] - SKLPort4 +# CHECK-NEXT: [7] - SKLPort5 +# CHECK-NEXT: [8] - SKLPort6 +# CHECK-NEXT: [9] - SKLPort7 + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] +# CHECK-NEXT: - - 1.50 1.50 2.50 2.50 - 1.50 1.50 - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] 
[8] [9] Instructions: +# CHECK-NEXT: - - - - - - - 0.50 0.50 - movq $128, %rsp +# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - popq %rax +# CHECK-NEXT: - - - 0.50 0.50 0.50 - - 0.50 - popq %rcx +# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - popq %rdx +# CHECK-NEXT: - - - 0.50 0.50 0.50 - - 0.50 - popq %rbx +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - popq %r12 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeER . . . movq $128, %rsp +# CHECK-NEXT: [0,1] D=eeeeeeER. . popq %rax +# CHECK-NEXT: [0,2] D=eeeeeeER. . popq %rcx +# CHECK-NEXT: [0,3] .D=eeeeeeER . popq %rdx +# CHECK-NEXT: [0,4] .D=eeeeeeER . popq %rbx +# CHECK-NEXT: [0,5] .D==eeeeeeER . popq %r12 +# CHECK-NEXT: [1,0] . DeE------R . movq $128, %rsp +# CHECK-NEXT: [1,1] . D=eeeeeeER . popq %rax +# CHECK-NEXT: [1,2] . D==eeeeeeER. popq %rcx +# CHECK-NEXT: [1,3] . D=eeeeeeER. popq %rdx +# CHECK-NEXT: [1,4] . D==eeeeeeER popq %rbx +# CHECK-NEXT: [1,5] . D==eeeeeeER popq %r12 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 1.0 1.0 3.0 movq $128, %rsp +# CHECK-NEXT: 1. 2 2.0 0.0 0.0 popq %rax +# CHECK-NEXT: 2. 2 2.5 0.5 0.0 popq %rcx +# CHECK-NEXT: 3. 2 2.0 1.0 0.0 popq %rdx +# CHECK-NEXT: 4. 2 2.5 1.5 0.0 popq %rbx +# CHECK-NEXT: 5. 
2 3.0 2.0 0.0 popq %r12 +# CHECK-NEXT: 2 2.2 1.0 0.5 diff --git a/llvm/test/tools/llvm-mca/X86/stack-engine-push.s b/llvm/test/tools/llvm-mca/X86/stack-engine-push.s new file mode 100644 index 0000000000000..fc394d4c1e7d3 --- /dev/null +++ b/llvm/test/tools/llvm-mca/X86/stack-engine-push.s @@ -0,0 +1,92 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -timeline -iterations=2 < %s | FileCheck %s + +movq $0x80, %rsp +pushq %rax +pushq %rcx +pushq %rdx +pushq %rbx +pushq %r12 + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 12 +# CHECK-NEXT: Total Cycles: 15 +# CHECK-NEXT: Total uOps: 32 + +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 2.13 +# CHECK-NEXT: IPC: 0.80 +# CHECK-NEXT: Block RThroughput: 5.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.25 movq $128, %rsp +# CHECK-NEXT: 3 2 1.00 * pushq %rax +# CHECK-NEXT: 3 2 1.00 * pushq %rcx +# CHECK-NEXT: 3 2 1.00 * pushq %rdx +# CHECK-NEXT: 3 2 1.00 * pushq %rbx +# CHECK-NEXT: 3 2 1.00 * pushq %r12 + +# CHECK: Resources: +# CHECK-NEXT: [0] - SKLDivider +# CHECK-NEXT: [1] - SKLFPDivider +# CHECK-NEXT: [2] - SKLPort0 +# CHECK-NEXT: [3] - SKLPort1 +# CHECK-NEXT: [4] - SKLPort2 +# CHECK-NEXT: [5] - SKLPort3 +# CHECK-NEXT: [6] - SKLPort4 +# CHECK-NEXT: [7] - SKLPort5 +# CHECK-NEXT: [8] - SKLPort6 +# CHECK-NEXT: [9] - SKLPort7 + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] +# CHECK-NEXT: - - 1.50 1.50 1.50 1.50 5.00 1.50 1.50 2.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: +# CHECK-NEXT: - - - - - - - - 1.00 - movq $128, %rsp +# CHECK-NEXT: - - 0.50 - 0.50 - 
1.00 0.50 - 0.50 pushq %rax +# CHECK-NEXT: - - - 0.50 - 0.50 1.00 - 0.50 0.50 pushq %rcx +# CHECK-NEXT: - - 0.50 - 0.50 0.50 1.00 0.50 - - pushq %rdx +# CHECK-NEXT: - - - 0.50 0.50 - 1.00 0.50 - 0.50 pushq %rbx +# CHECK-NEXT: - - 0.50 0.50 - 0.50 1.00 - - 0.50 pushq %r12 + +# CHECK: Timeline view: +# CHECK-NEXT: 01234 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeER . . . movq $128, %rsp +# CHECK-NEXT: [0,1] D=eeER . . pushq %rax +# CHECK-NEXT: [0,2] .D=eeER . . pushq %rcx +# CHECK-NEXT: [0,3] .D==eeER . . pushq %rdx +# CHECK-NEXT: [0,4] . D==eeER . . pushq %rbx +# CHECK-NEXT: [0,5] . D===eeER. . pushq %r12 +# CHECK-NEXT: [1,0] . DeE---R. . movq $128, %rsp +# CHECK-NEXT: [1,1] . D===eeER . pushq %rax +# CHECK-NEXT: [1,2] . D===eeER . pushq %rcx +# CHECK-NEXT: [1,3] . D====eeER . pushq %rdx +# CHECK-NEXT: [1,4] . D====eeER. pushq %rbx +# CHECK-NEXT: [1,5] . D=====eeER pushq %r12 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 1.0 1.0 1.5 movq $128, %rsp +# CHECK-NEXT: 1. 2 3.0 0.5 0.0 pushq %rax +# CHECK-NEXT: 2. 2 3.0 1.0 0.0 pushq %rcx +# CHECK-NEXT: 3. 2 4.0 1.0 0.0 pushq %rdx +# CHECK-NEXT: 4. 2 4.0 1.0 0.0 pushq %rbx +# CHECK-NEXT: 5. 2 5.0 1.0 0.0 pushq %r12 +# CHECK-NEXT: 2 3.3 0.9 0.3