From aa7076ca0b50e7152bb015bf543e1c182f3c58c3 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Wed, 13 Aug 2025 05:16:40 +0000 Subject: [PATCH 1/3] [MCA][X86] Pretend To Have a Stack Engine This patch removes RSP dependencies from push and pop instructions to pretend that we have a stack engine. This does not model details like sync uops that are relevant implementation details due to complexity. This is just enabled on all X86 CPUs given LLVM does not have a scheduling model for any X86 CPU that does not have a stack engine. This fixes #152008. --- .../lib/Target/X86/MCA/X86CustomBehaviour.cpp | 15 +++ llvm/lib/Target/X86/MCA/X86CustomBehaviour.h | 5 + .../tools/llvm-mca/X86/stack-engine-pop.s | 92 +++++++++++++++++++ .../tools/llvm-mca/X86/stack-engine-push.s | 92 +++++++++++++++++++ 4 files changed, 204 insertions(+) create mode 100644 llvm/test/tools/llvm-mca/X86/stack-engine-pop.s create mode 100644 llvm/test/tools/llvm-mca/X86/stack-engine-push.s diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp index 817e88d8a0bc0..71cb49330e542 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp @@ -36,11 +36,26 @@ void X86InstrPostProcess::setMemBarriers(std::unique_ptr &Inst, } } +void X86InstrPostProcess::useStackEngine(std::unique_ptr &Inst, + const MCInst &MCI) { + if (X86::isPOP(MCI.getOpcode()) || X86::isPUSH(MCI.getOpcode())) { + auto *StackRegisterDef = + llvm::find_if(Inst->getDefs(), [](const WriteState &State) { + return State.getRegisterID() == X86::RSP; + }); + assert( + StackRegisterDef != Inst->getDefs().end() && + "Expected push instruction to implicitly use stack pointer register."); + Inst->getDefs().erase(StackRegisterDef); + } +} + void X86InstrPostProcess::postProcessInstruction( std::unique_ptr &Inst, const MCInst &MCI) { // Currently, we only modify certain instructions' IsALoadBarrier and // IsAStoreBarrier flags. 
setMemBarriers(Inst, MCI); + useStackEngine(Inst, MCI); } } // namespace mca diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h index 4a83ba848dd88..c5459e42dfc9f 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h @@ -28,6 +28,11 @@ class X86InstrPostProcess : public InstrPostProcess { /// as load and store barriers. void setMemBarriers(std::unique_ptr &Inst, const MCInst &MCI); + /// Called within X86InstrPostProcess to remove the RSP write (def) from + /// push and pop instructions to better simulate the stack engine. We + /// currently do not model features of the stack engine like sync uops. + void useStackEngine(std::unique_ptr &Inst, const MCInst &MCI); + public: X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII) : InstrPostProcess(STI, MCII) {} diff --git a/llvm/test/tools/llvm-mca/X86/stack-engine-pop.s b/llvm/test/tools/llvm-mca/X86/stack-engine-pop.s new file mode 100644 index 0000000000000..2ffb52ae61fc4 --- /dev/null +++ b/llvm/test/tools/llvm-mca/X86/stack-engine-pop.s @@ -0,0 +1,92 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -timeline -iterations=2 < %s | FileCheck %s + +movq $0x80, %rsp +popq %rax +popq %rcx +popq %rdx +popq %rbx +popq %r12 + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 12 +# CHECK-NEXT: Total Cycles: 14 +# CHECK-NEXT: Total uOps: 22 + +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 1.57 +# CHECK-NEXT: IPC: 0.86 +# CHECK-NEXT: Block RThroughput: 2.5 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.25 movq $128, %rsp +# CHECK-NEXT: 2 6 0.50 * popq %rax +# 
CHECK-NEXT: 2 6 0.50 * popq %rcx +# CHECK-NEXT: 2 6 0.50 * popq %rdx +# CHECK-NEXT: 2 6 0.50 * popq %rbx +# CHECK-NEXT: 2 6 0.50 * popq %r12 + +# CHECK: Resources: +# CHECK-NEXT: [0] - SKLDivider +# CHECK-NEXT: [1] - SKLFPDivider +# CHECK-NEXT: [2] - SKLPort0 +# CHECK-NEXT: [3] - SKLPort1 +# CHECK-NEXT: [4] - SKLPort2 +# CHECK-NEXT: [5] - SKLPort3 +# CHECK-NEXT: [6] - SKLPort4 +# CHECK-NEXT: [7] - SKLPort5 +# CHECK-NEXT: [8] - SKLPort6 +# CHECK-NEXT: [9] - SKLPort7 + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] +# CHECK-NEXT: - - 1.50 1.50 2.50 2.50 - 1.50 1.50 - + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: +# CHECK-NEXT: - - - - - - - 0.50 0.50 - movq $128, %rsp +# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - popq %rax +# CHECK-NEXT: - - - 0.50 0.50 0.50 - - 0.50 - popq %rcx +# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - popq %rdx +# CHECK-NEXT: - - - 0.50 0.50 0.50 - - 0.50 - popq %rbx +# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - popq %r12 + +# CHECK: Timeline view: +# CHECK-NEXT: 0123 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeER . . . movq $128, %rsp +# CHECK-NEXT: [0,1] D=eeeeeeER. . popq %rax +# CHECK-NEXT: [0,2] D=eeeeeeER. . popq %rcx +# CHECK-NEXT: [0,3] .D=eeeeeeER . popq %rdx +# CHECK-NEXT: [0,4] .D=eeeeeeER . popq %rbx +# CHECK-NEXT: [0,5] .D==eeeeeeER . popq %r12 +# CHECK-NEXT: [1,0] . DeE------R . movq $128, %rsp +# CHECK-NEXT: [1,1] . D=eeeeeeER . popq %rax +# CHECK-NEXT: [1,2] . D==eeeeeeER. popq %rcx +# CHECK-NEXT: [1,3] . D=eeeeeeER. popq %rdx +# CHECK-NEXT: [1,4] . D==eeeeeeER popq %rbx +# CHECK-NEXT: [1,5] . 
D==eeeeeeER popq %r12 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 1.0 1.0 3.0 movq $128, %rsp +# CHECK-NEXT: 1. 2 2.0 0.0 0.0 popq %rax +# CHECK-NEXT: 2. 2 2.5 0.5 0.0 popq %rcx +# CHECK-NEXT: 3. 2 2.0 1.0 0.0 popq %rdx +# CHECK-NEXT: 4. 2 2.5 1.5 0.0 popq %rbx +# CHECK-NEXT: 5. 2 3.0 2.0 0.0 popq %r12 +# CHECK-NEXT: 2 2.2 1.0 0.5 diff --git a/llvm/test/tools/llvm-mca/X86/stack-engine-push.s b/llvm/test/tools/llvm-mca/X86/stack-engine-push.s new file mode 100644 index 0000000000000..fc394d4c1e7d3 --- /dev/null +++ b/llvm/test/tools/llvm-mca/X86/stack-engine-push.s @@ -0,0 +1,92 @@ +# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py +# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -timeline -iterations=2 < %s | FileCheck %s + +movq $0x80, %rsp +pushq %rax +pushq %rcx +pushq %rdx +pushq %rbx +pushq %r12 + +# CHECK: Iterations: 2 +# CHECK-NEXT: Instructions: 12 +# CHECK-NEXT: Total Cycles: 15 +# CHECK-NEXT: Total uOps: 32 + +# CHECK: Dispatch Width: 6 +# CHECK-NEXT: uOps Per Cycle: 2.13 +# CHECK-NEXT: IPC: 0.80 +# CHECK-NEXT: Block RThroughput: 5.0 + +# CHECK: Instruction Info: +# CHECK-NEXT: [1]: #uOps +# CHECK-NEXT: [2]: Latency +# CHECK-NEXT: [3]: RThroughput +# CHECK-NEXT: [4]: MayLoad +# CHECK-NEXT: [5]: MayStore +# CHECK-NEXT: [6]: HasSideEffects (U) + +# CHECK: [1] [2] [3] [4] [5] [6] Instructions: +# CHECK-NEXT: 1 1 0.25 movq $128, %rsp +# CHECK-NEXT: 3 2 1.00 * pushq %rax +# CHECK-NEXT: 3 2 1.00 * pushq %rcx +# CHECK-NEXT: 3 2 1.00 * pushq %rdx +# CHECK-NEXT: 3 2 1.00 * pushq %rbx +# CHECK-NEXT: 3 2 1.00 * pushq %r12 + +# CHECK: Resources: +# CHECK-NEXT: [0] - SKLDivider +# CHECK-NEXT: [1] - SKLFPDivider +# 
CHECK-NEXT: [2] - SKLPort0 +# CHECK-NEXT: [3] - SKLPort1 +# CHECK-NEXT: [4] - SKLPort2 +# CHECK-NEXT: [5] - SKLPort3 +# CHECK-NEXT: [6] - SKLPort4 +# CHECK-NEXT: [7] - SKLPort5 +# CHECK-NEXT: [8] - SKLPort6 +# CHECK-NEXT: [9] - SKLPort7 + +# CHECK: Resource pressure per iteration: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] +# CHECK-NEXT: - - 1.50 1.50 1.50 1.50 5.00 1.50 1.50 2.00 + +# CHECK: Resource pressure by instruction: +# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions: +# CHECK-NEXT: - - - - - - - - 1.00 - movq $128, %rsp +# CHECK-NEXT: - - 0.50 - 0.50 - 1.00 0.50 - 0.50 pushq %rax +# CHECK-NEXT: - - - 0.50 - 0.50 1.00 - 0.50 0.50 pushq %rcx +# CHECK-NEXT: - - 0.50 - 0.50 0.50 1.00 0.50 - - pushq %rdx +# CHECK-NEXT: - - - 0.50 0.50 - 1.00 0.50 - 0.50 pushq %rbx +# CHECK-NEXT: - - 0.50 0.50 - 0.50 1.00 - - 0.50 pushq %r12 + +# CHECK: Timeline view: +# CHECK-NEXT: 01234 +# CHECK-NEXT: Index 0123456789 + +# CHECK: [0,0] DeER . . . movq $128, %rsp +# CHECK-NEXT: [0,1] D=eeER . . pushq %rax +# CHECK-NEXT: [0,2] .D=eeER . . pushq %rcx +# CHECK-NEXT: [0,3] .D==eeER . . pushq %rdx +# CHECK-NEXT: [0,4] . D==eeER . . pushq %rbx +# CHECK-NEXT: [0,5] . D===eeER. . pushq %r12 +# CHECK-NEXT: [1,0] . DeE---R. . movq $128, %rsp +# CHECK-NEXT: [1,1] . D===eeER . pushq %rax +# CHECK-NEXT: [1,2] . D===eeER . pushq %rcx +# CHECK-NEXT: [1,3] . D====eeER . pushq %rdx +# CHECK-NEXT: [1,4] . D====eeER. pushq %rbx +# CHECK-NEXT: [1,5] . D=====eeER pushq %r12 + +# CHECK: Average Wait times (based on the timeline view): +# CHECK-NEXT: [0]: Executions +# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue +# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready +# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage + +# CHECK: [0] [1] [2] [3] +# CHECK-NEXT: 0. 2 1.0 1.0 1.5 movq $128, %rsp +# CHECK-NEXT: 1. 2 3.0 0.5 0.0 pushq %rax +# CHECK-NEXT: 2. 2 3.0 1.0 0.0 pushq %rcx +# CHECK-NEXT: 3. 
2 4.0 1.0 0.0 pushq %rdx +# CHECK-NEXT: 4. 2 4.0 1.0 0.0 pushq %rbx +# CHECK-NEXT: 5. 2 5.0 1.0 0.0 pushq %r12 +# CHECK-NEXT: 2 3.3 0.9 0.3 From 2acc2b527d7d34f838e7f88c01c81608092e4bc2 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 14 Aug 2025 16:26:52 +0000 Subject: [PATCH 2/3] add todo --- llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp index 71cb49330e542..4fcf263d204c8 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp @@ -38,6 +38,9 @@ void X86InstrPostProcess::setMemBarriers(std::unique_ptr &Inst, void X86InstrPostProcess::useStackEngine(std::unique_ptr &Inst, const MCInst &MCI) { + // TODO(boomanaiden154): We currently do not handle PUSHF/POPF because we + // have not done the necessary benchmarking to see if they are also + // optimized by the stack engine. if (X86::isPOP(MCI.getOpcode()) || X86::isPUSH(MCI.getOpcode())) { auto *StackRegisterDef = llvm::find_if(Inst->getDefs(), [](const WriteState &State) { From a5a592dee4c760d34e93928d76fd1b733e790c5f Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Thu, 14 Aug 2025 22:37:44 +0000 Subject: [PATCH 3/3] Feedback --- llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp index 4fcf263d204c8..aa2027a13da2a 100644 --- a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp +++ b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp @@ -55,8 +55,7 @@ void X86InstrPostProcess::useStackEngine(std::unique_ptr &Inst, void X86InstrPostProcess::postProcessInstruction( std::unique_ptr &Inst, const MCInst &MCI) { - // Currently, we only modify certain instructions' IsALoadBarrier and - // IsAStoreBarrier flags. + // Set IsALoadBarrier and IsAStoreBarrier flags. 
setMemBarriers(Inst, MCI); useStackEngine(Inst, MCI); }