llvm · boomanaiden154 · Aug 13, 2025 · adibiagio · Aug 13, 2025 · adibiagio
diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp
@@ -36,11 +36,41 @@ void X86InstrPostProcess::setMemBarriers(std::unique_ptr<Instruction> &Inst,
   }
 }
 
+/// Removes the stack pointer read from push and pop instructions so that
+/// back-to-back stack operations are not serialized on the stack pointer,
+/// approximating a hardware stack engine.
+///
+/// Only a single read of SP/ESP/RSP is dropped; every other read (for example
+/// the address registers of a memory operand) is kept. If no stack pointer
+/// read is found the instruction is left untouched.
+void X86InstrPostProcess::useStackEngine(std::unique_ptr<Instruction> &Inst,
+                                         const MCInst &MCI) {
+  const unsigned Opcode = MCI.getOpcode();
+  if (!X86::isPOP(Opcode) && !X86::isPUSH(Opcode))
+    return;
+
+  // Match every width of the stack pointer: which one shows up as the implicit
+  // use depends on the push/pop variant being simulated.
+  auto IsStackPointerRead = [](const ReadState &State) {
+    const auto Reg = State.getRegisterID();
+    return Reg == X86::RSP || Reg == X86::ESP || Reg == X86::SP;
+  };
+
+  auto &Uses = Inst->getUses();
+  auto *StackRegisterUse = llvm::find_if(Uses, IsStackPointerRead);
+  // Do not erase end(): asserts are compiled out in release builds, so an
+  // unexpected operand list must be handled here rather than asserted on.
+  if (StackRegisterUse == Uses.end())
+    return;
+  Uses.erase(StackRegisterUse);
+}
+
 void X86InstrPostProcess::postProcessInstruction(
     std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
-  // Currently, we only modify certain instructions' IsALoadBarrier and
-  // IsAStoreBarrier flags.
+  // Mark the instructions that act as load and/or store barriers.
   setMemBarriers(Inst, MCI);
+  // Drop the stack pointer read from push/pop to mimic the stack engine.
+  useStackEngine(Inst, MCI);
 }
 
 } // namespace mca

diff --git a/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h b/llvm/lib/Target/X86/MCA/X86CustomBehaviour.h
@@ -28,6 +28,11 @@ class X86InstrPostProcess : public InstrPostProcess {
   /// as load and store barriers.
   void setMemBarriers(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
 
+  /// Called within X86InstrPostProcess to remove some rsp read operands
+  /// on stack instructions to better simulate the stack engine. We currently
+  /// do not model features of the stack engine like sync uops.
+  void useStackEngine(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
+
 public:
   X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
       : InstrPostProcess(STI, MCII) {}

diff --git a/llvm/test/tools/llvm-mca/X86/stack-engine-pop.s b/llvm/test/tools/llvm-mca/X86/stack-engine-pop.s
@@ -0,0 +1,81 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -timeline -iterations=1 < %s | FileCheck %s
+
+popq %rax
+popq %rcx
+popq %rdx
+popq %rbx
+popq %r12
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      5
+# CHECK-NEXT: Total Cycles:      11
+# CHECK-NEXT: Total uOps:        10
+
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    0.91
+# CHECK-NEXT: IPC:               0.45
+# CHECK-NEXT: Block RThroughput: 2.5
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  2      6     0.50    *                   popq	%rax
+# CHECK-NEXT:  2      6     0.50    *                   popq	%rcx
+# CHECK-NEXT:  2      6     0.50    *                   popq	%rdx
+# CHECK-NEXT:  2      6     0.50    *                   popq	%rbx
+# CHECK-NEXT:  2      6     0.50    *                   popq	%r12
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SKLDivider
+# CHECK-NEXT: [1]   - SKLFPDivider
+# CHECK-NEXT: [2]   - SKLPort0
+# CHECK-NEXT: [3]   - SKLPort1
+# CHECK-NEXT: [4]   - SKLPort2
+# CHECK-NEXT: [5]   - SKLPort3
+# CHECK-NEXT: [6]   - SKLPort4
+# CHECK-NEXT: [7]   - SKLPort5
+# CHECK-NEXT: [8]   - SKLPort6
+# CHECK-NEXT: [9]   - SKLPort7
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
+# CHECK-NEXT:  -      -     1.00   1.00   2.00   3.00    -     1.00   2.00    -
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00    -     popq	%rax
+# CHECK-NEXT:  -      -      -      -     1.00    -      -     1.00    -      -     popq	%rcx
+# CHECK-NEXT:  -      -      -     1.00    -     1.00    -      -      -      -     popq	%rdx
+# CHECK-NEXT:  -      -     1.00    -     1.00    -      -      -      -      -     popq	%rbx
+# CHECK-NEXT:  -      -      -      -      -     1.00    -      -     1.00    -     popq	%r12
+
+# CHECK:      Timeline view:
+# CHECK-NEXT:                     0
+# CHECK-NEXT: Index     0123456789
+
+# CHECK:      [0,0]     DeeeeeeER .   popq	%rax
+# CHECK-NEXT: [0,1]     DeeeeeeER .   popq	%rcx
+# CHECK-NEXT: [0,2]     D=eeeeeeER.   popq	%rdx
+# CHECK-NEXT: [0,3]     .DeeeeeeER.   popq	%rbx
+# CHECK-NEXT: [0,4]     .D=eeeeeeER   popq	%r12
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       popq	%rax
+# CHECK-NEXT: 1.     1     1.0    1.0    0.0       popq	%rcx
+# CHECK-NEXT: 2.     1     2.0    2.0    0.0       popq	%rdx
+# CHECK-NEXT: 3.     1     1.0    1.0    0.0       popq	%rbx
+# CHECK-NEXT: 4.     1     2.0    2.0    0.0       popq	%r12
+# CHECK-NEXT:        1     1.4    1.4    0.0       <total>
diff --git a/llvm/test/tools/llvm-mca/X86/stack-engine-push.s b/llvm/test/tools/llvm-mca/X86/stack-engine-push.s
@@ -0,0 +1,80 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -timeline -iterations=1 < %s | FileCheck %s
+
+pushq %rax
+pushq %rcx
+pushq %rdx
+pushq %rbx
+pushq %r12
+
+# CHECK:      Iterations:        1
+# CHECK-NEXT: Instructions:      5
+# CHECK-NEXT: Total Cycles:      9
+# CHECK-NEXT: Total uOps:        15
+
+# CHECK:      Dispatch Width:    6
+# CHECK-NEXT: uOps Per Cycle:    1.67
+# CHECK-NEXT: IPC:               0.56
+# CHECK-NEXT: Block RThroughput: 5.0
+
+# CHECK:      Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+
+# CHECK:      [1]    [2]    [3]    [4]    [5]    [6]    Instructions:
+# CHECK-NEXT:  3      2     1.00           *            pushq	%rax
+# CHECK-NEXT:  3      2     1.00           *            pushq	%rcx
+# CHECK-NEXT:  3      2     1.00           *            pushq	%rdx
+# CHECK-NEXT:  3      2     1.00           *            pushq	%rbx
+# CHECK-NEXT:  3      2     1.00           *            pushq	%r12
+
+# CHECK:      Resources:
+# CHECK-NEXT: [0]   - SKLDivider
+# CHECK-NEXT: [1]   - SKLFPDivider
+# CHECK-NEXT: [2]   - SKLPort0
+# CHECK-NEXT: [3]   - SKLPort1
+# CHECK-NEXT: [4]   - SKLPort2
+# CHECK-NEXT: [5]   - SKLPort3
+# CHECK-NEXT: [6]   - SKLPort4
+# CHECK-NEXT: [7]   - SKLPort5
+# CHECK-NEXT: [8]   - SKLPort6
+# CHECK-NEXT: [9]   - SKLPort7
+
+# CHECK:      Resource pressure per iteration:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]
+# CHECK-NEXT:  -      -     1.00   1.00   1.00   2.00   5.00   1.00   2.00   2.00
+
+# CHECK:      Resource pressure by instruction:
+# CHECK-NEXT: [0]    [1]    [2]    [3]    [4]    [5]    [6]    [7]    [8]    [9]    Instructions:
+# CHECK-NEXT:  -      -      -      -      -      -     1.00    -     1.00   1.00   pushq	%rax
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00   1.00    -      -     pushq	%rcx
+# CHECK-NEXT:  -      -      -     1.00   1.00    -     1.00    -      -      -     pushq	%rdx
+# CHECK-NEXT:  -      -     1.00    -      -      -     1.00    -      -     1.00   pushq	%rbx
+# CHECK-NEXT:  -      -      -      -      -     1.00   1.00    -     1.00    -     pushq	%r12
+
+# CHECK:      Timeline view:
+# CHECK-NEXT: Index     012345678
+
+# CHECK:      [0,0]     DeeER.  .   pushq	%rax
+# CHECK-NEXT: [0,1]     D=eeER  .   pushq	%rcx
+# CHECK-NEXT: [0,2]     .D=eeER .   pushq	%rdx
+# CHECK-NEXT: [0,3]     .D==eeER.   pushq	%rbx
+# CHECK-NEXT: [0,4]     . D==eeER   pushq	%r12
+
+# CHECK:      Average Wait times (based on the timeline view):
+# CHECK-NEXT: [0]: Executions
+# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
+# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
+# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
+
+# CHECK:            [0]    [1]    [2]    [3]
+# CHECK-NEXT: 0.     1     1.0    1.0    0.0       pushq	%rax
+# CHECK-NEXT: 1.     1     2.0    1.0    0.0       pushq	%rcx
+# CHECK-NEXT: 2.     1     2.0    1.0    0.0       pushq	%rdx
+# CHECK-NEXT: 3.     1     3.0    1.0    0.0       pushq	%rbx
+# CHECK-NEXT: 4.     1     3.0    1.0    0.0       pushq	%r12
+# CHECK-NEXT:        1     2.2    1.0    0.0       <total>