Skip to content

Commit b78c89e

Browse files
[MCA][X86] Pretend To Have a Stack Engine
This patch removes RSP dependencies from push and pop instructions to pretend that we have a stack engine. Because of the complexity involved, it does not model implementation details such as sync uops, even though they are relevant. This is enabled on all X86 CPUs, given that LLVM does not have a scheduling model for any X86 CPU that lacks a stack engine. This fixes #152008.
1 parent 91de0a2 commit b78c89e

File tree

4 files changed

+186
-0
lines changed

4 files changed

+186
-0
lines changed

llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,31 @@ void X86InstrPostProcess::setMemBarriers(std::unique_ptr<Instruction> &Inst,
3636
}
3737
}
3838

39+
void X86InstrPostProcess::useStackEngine(std::unique_ptr<Instruction> &Inst,
40+
const MCInst &MCI) {
41+
if (X86::isPOP(MCI.getOpcode())) {
42+
assert(Inst->getUses().size() == 1 &&
43+
"Expected pop instruction to only use stack pointer register");
44+
Inst->getUses().clear();
45+
}
46+
if (X86::isPUSH(MCI.getOpcode())) {
47+
auto *StackRegisterUse =
48+
llvm::find_if(Inst->getUses(), [](const ReadState &State) {
49+
return State.getRegisterID() == X86::RSP;
50+
});
51+
assert(
52+
StackRegisterUse != Inst->getUses().end() &&
53+
"Expected push instruction to implicitly use stack pointer register.");
54+
Inst->getUses().erase(StackRegisterUse);
55+
}
56+
}
57+
3958
void X86InstrPostProcess::postProcessInstruction(
4059
std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
4160
// Currently, we only modify certain instructions' IsALoadBarrier and
4261
// IsAStoreBarrier flags.
4362
setMemBarriers(Inst, MCI);
63+
useStackEngine(Inst, MCI);
4464
}
4565

4666
} // namespace mca

llvm/lib/Target/X86/MCA/X86CustomBehaviour.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@ class X86InstrPostProcess : public InstrPostProcess {
2828
/// as load and store barriers.
2929
void setMemBarriers(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
3030

31+
/// Called within X86InstrPostProcess to remove some rsp read operands
32+
/// on stack instructions to better simulate the stack engine. We currently
33+
/// do not model features of the stack engine like sync uops.
34+
void useStackEngine(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
35+
3136
public:
3237
X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
3338
: InstrPostProcess(STI, MCII) {}
Lines changed: 81 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,81 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2+
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -timeline -iterations=1 < %s | FileCheck %s
3+
4+
popq %rax
5+
popq %rcx
6+
popq %rdx
7+
popq %rbx
8+
popq %r12
9+
10+
# CHECK: Iterations: 1
11+
# CHECK-NEXT: Instructions: 5
12+
# CHECK-NEXT: Total Cycles: 11
13+
# CHECK-NEXT: Total uOps: 10
14+
15+
# CHECK: Dispatch Width: 6
16+
# CHECK-NEXT: uOps Per Cycle: 0.91
17+
# CHECK-NEXT: IPC: 0.45
18+
# CHECK-NEXT: Block RThroughput: 2.5
19+
20+
# CHECK: Instruction Info:
21+
# CHECK-NEXT: [1]: #uOps
22+
# CHECK-NEXT: [2]: Latency
23+
# CHECK-NEXT: [3]: RThroughput
24+
# CHECK-NEXT: [4]: MayLoad
25+
# CHECK-NEXT: [5]: MayStore
26+
# CHECK-NEXT: [6]: HasSideEffects (U)
27+
28+
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
29+
# CHECK-NEXT: 2 6 0.50 * popq %rax
30+
# CHECK-NEXT: 2 6 0.50 * popq %rcx
31+
# CHECK-NEXT: 2 6 0.50 * popq %rdx
32+
# CHECK-NEXT: 2 6 0.50 * popq %rbx
33+
# CHECK-NEXT: 2 6 0.50 * popq %r12
34+
35+
# CHECK: Resources:
36+
# CHECK-NEXT: [0] - SKLDivider
37+
# CHECK-NEXT: [1] - SKLFPDivider
38+
# CHECK-NEXT: [2] - SKLPort0
39+
# CHECK-NEXT: [3] - SKLPort1
40+
# CHECK-NEXT: [4] - SKLPort2
41+
# CHECK-NEXT: [5] - SKLPort3
42+
# CHECK-NEXT: [6] - SKLPort4
43+
# CHECK-NEXT: [7] - SKLPort5
44+
# CHECK-NEXT: [8] - SKLPort6
45+
# CHECK-NEXT: [9] - SKLPort7
46+
47+
# CHECK: Resource pressure per iteration:
48+
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
49+
# CHECK-NEXT: - - 1.00 1.00 2.00 3.00 - 1.00 2.00 -
50+
51+
# CHECK: Resource pressure by instruction:
52+
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
53+
# CHECK-NEXT: - - - - - 1.00 - - 1.00 - popq %rax
54+
# CHECK-NEXT: - - - - 1.00 - - 1.00 - - popq %rcx
55+
# CHECK-NEXT: - - - 1.00 - 1.00 - - - - popq %rdx
56+
# CHECK-NEXT: - - 1.00 - 1.00 - - - - - popq %rbx
57+
# CHECK-NEXT: - - - - - 1.00 - - 1.00 - popq %r12
58+
59+
# CHECK: Timeline view:
60+
# CHECK-NEXT: 0
61+
# CHECK-NEXT: Index 0123456789
62+
63+
# CHECK: [0,0] DeeeeeeER . popq %rax
64+
# CHECK-NEXT: [0,1] DeeeeeeER . popq %rcx
65+
# CHECK-NEXT: [0,2] D=eeeeeeER. popq %rdx
66+
# CHECK-NEXT: [0,3] .DeeeeeeER. popq %rbx
67+
# CHECK-NEXT: [0,4] .D=eeeeeeER popq %r12
68+
69+
# CHECK: Average Wait times (based on the timeline view):
70+
# CHECK-NEXT: [0]: Executions
71+
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
72+
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
73+
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
74+
75+
# CHECK: [0] [1] [2] [3]
76+
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 popq %rax
77+
# CHECK-NEXT: 1. 1 1.0 1.0 0.0 popq %rcx
78+
# CHECK-NEXT: 2. 1 2.0 2.0 0.0 popq %rdx
79+
# CHECK-NEXT: 3. 1 1.0 1.0 0.0 popq %rbx
80+
# CHECK-NEXT: 4. 1 2.0 2.0 0.0 popq %r12
81+
# CHECK-NEXT: 1 1.4 1.4 0.0 <total>
Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2+
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -timeline -iterations=1 < %s | FileCheck %s
3+
4+
pushq %rax
5+
pushq %rcx
6+
pushq %rdx
7+
pushq %rbx
8+
pushq %r12
9+
10+
# CHECK: Iterations: 1
11+
# CHECK-NEXT: Instructions: 5
12+
# CHECK-NEXT: Total Cycles: 9
13+
# CHECK-NEXT: Total uOps: 15
14+
15+
# CHECK: Dispatch Width: 6
16+
# CHECK-NEXT: uOps Per Cycle: 1.67
17+
# CHECK-NEXT: IPC: 0.56
18+
# CHECK-NEXT: Block RThroughput: 5.0
19+
20+
# CHECK: Instruction Info:
21+
# CHECK-NEXT: [1]: #uOps
22+
# CHECK-NEXT: [2]: Latency
23+
# CHECK-NEXT: [3]: RThroughput
24+
# CHECK-NEXT: [4]: MayLoad
25+
# CHECK-NEXT: [5]: MayStore
26+
# CHECK-NEXT: [6]: HasSideEffects (U)
27+
28+
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
29+
# CHECK-NEXT: 3 2 1.00 * pushq %rax
30+
# CHECK-NEXT: 3 2 1.00 * pushq %rcx
31+
# CHECK-NEXT: 3 2 1.00 * pushq %rdx
32+
# CHECK-NEXT: 3 2 1.00 * pushq %rbx
33+
# CHECK-NEXT: 3 2 1.00 * pushq %r12
34+
35+
# CHECK: Resources:
36+
# CHECK-NEXT: [0] - SKLDivider
37+
# CHECK-NEXT: [1] - SKLFPDivider
38+
# CHECK-NEXT: [2] - SKLPort0
39+
# CHECK-NEXT: [3] - SKLPort1
40+
# CHECK-NEXT: [4] - SKLPort2
41+
# CHECK-NEXT: [5] - SKLPort3
42+
# CHECK-NEXT: [6] - SKLPort4
43+
# CHECK-NEXT: [7] - SKLPort5
44+
# CHECK-NEXT: [8] - SKLPort6
45+
# CHECK-NEXT: [9] - SKLPort7
46+
47+
# CHECK: Resource pressure per iteration:
48+
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
49+
# CHECK-NEXT: - - 1.00 1.00 1.00 2.00 5.00 1.00 2.00 2.00
50+
51+
# CHECK: Resource pressure by instruction:
52+
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
53+
# CHECK-NEXT: - - - - - - 1.00 - 1.00 1.00 pushq %rax
54+
# CHECK-NEXT: - - - - - 1.00 1.00 1.00 - - pushq %rcx
55+
# CHECK-NEXT: - - - 1.00 1.00 - 1.00 - - - pushq %rdx
56+
# CHECK-NEXT: - - 1.00 - - - 1.00 - - 1.00 pushq %rbx
57+
# CHECK-NEXT: - - - - - 1.00 1.00 - 1.00 - pushq %r12
58+
59+
# CHECK: Timeline view:
60+
# CHECK-NEXT: Index 012345678
61+
62+
# CHECK: [0,0] DeeER. . pushq %rax
63+
# CHECK-NEXT: [0,1] D=eeER . pushq %rcx
64+
# CHECK-NEXT: [0,2] .D=eeER . pushq %rdx
65+
# CHECK-NEXT: [0,3] .D==eeER. pushq %rbx
66+
# CHECK-NEXT: [0,4] . D==eeER pushq %r12
67+
68+
# CHECK: Average Wait times (based on the timeline view):
69+
# CHECK-NEXT: [0]: Executions
70+
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
71+
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
72+
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
73+
74+
# CHECK: [0] [1] [2] [3]
75+
# CHECK-NEXT: 0. 1 1.0 1.0 0.0 pushq %rax
76+
# CHECK-NEXT: 1. 1 2.0 1.0 0.0 pushq %rcx
77+
# CHECK-NEXT: 2. 1 2.0 1.0 0.0 pushq %rdx
78+
# CHECK-NEXT: 3. 1 3.0 1.0 0.0 pushq %rbx
79+
# CHECK-NEXT: 4. 1 3.0 1.0 0.0 pushq %r12
80+
# CHECK-NEXT: 1 2.2 1.0 0.0 <total>

0 commit comments

Comments
 (0)