Skip to content

Commit aa7076c

Browse files
[MCA][X86] Pretend To Have a Stack Engine
This patch removes the RSP dependencies from push and pop instructions so that llvm-mca behaves as if the CPU had a stack engine. Details of the stack engine such as sync uops are not modeled because of their complexity. The behavior is enabled on all X86 CPUs, since LLVM does not have a scheduling model for any X86 CPU that lacks a stack engine. This fixes #152008.
1 parent 8cc22ee commit aa7076c

File tree

4 files changed

+204
-0
lines changed

4 files changed

+204
-0
lines changed

llvm/lib/Target/X86/MCA/X86CustomBehaviour.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,11 +36,26 @@ void X86InstrPostProcess::setMemBarriers(std::unique_ptr<Instruction> &Inst,
3636
}
3737
}
3838

39+
void X86InstrPostProcess::useStackEngine(std::unique_ptr<Instruction> &Inst,
40+
const MCInst &MCI) {
41+
if (X86::isPOP(MCI.getOpcode()) || X86::isPUSH(MCI.getOpcode())) {
42+
auto *StackRegisterDef =
43+
llvm::find_if(Inst->getDefs(), [](const WriteState &State) {
44+
return State.getRegisterID() == X86::RSP;
45+
});
46+
assert(
47+
StackRegisterDef != Inst->getDefs().end() &&
48+
"Expected push instruction to implicitly use stack pointer register.");
49+
Inst->getDefs().erase(StackRegisterDef);
50+
}
51+
}
52+
3953
void X86InstrPostProcess::postProcessInstruction(
4054
std::unique_ptr<Instruction> &Inst, const MCInst &MCI) {
4155
// Adjust certain instructions' IsALoadBarrier and IsAStoreBarrier flags,
4256
// then apply the stack-engine adjustment for push and pop instructions.
4357
setMemBarriers(Inst, MCI);
58+
useStackEngine(Inst, MCI);
4459
}
4560

4661
} // namespace mca

llvm/lib/Target/X86/MCA/X86CustomBehaviour.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,11 @@ class X86InstrPostProcess : public InstrPostProcess {
2828
/// as load and store barriers.
2929
void setMemBarriers(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
3030

31+
/// Called within X86InstrPostProcess to remove the RSP write (def) from
32+
/// push and pop instructions to better simulate the stack engine. We currently
33+
/// do not model features of the stack engine like sync uops.
34+
void useStackEngine(std::unique_ptr<Instruction> &Inst, const MCInst &MCI);
35+
3136
public:
3237
/// Forwards \p STI and \p MCII to the InstrPostProcess base class; the
/// constructor body is empty.
X86InstrPostProcess(const MCSubtargetInfo &STI, const MCInstrInfo &MCII)
3338
: InstrPostProcess(STI, MCII) {}
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2+
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -timeline -iterations=2 < %s | FileCheck %s
3+
4+
movq $0x80, %rsp
5+
popq %rax
6+
popq %rcx
7+
popq %rdx
8+
popq %rbx
9+
popq %r12
10+
11+
# CHECK: Iterations: 2
12+
# CHECK-NEXT: Instructions: 12
13+
# CHECK-NEXT: Total Cycles: 14
14+
# CHECK-NEXT: Total uOps: 22
15+
16+
# CHECK: Dispatch Width: 6
17+
# CHECK-NEXT: uOps Per Cycle: 1.57
18+
# CHECK-NEXT: IPC: 0.86
19+
# CHECK-NEXT: Block RThroughput: 2.5
20+
21+
# CHECK: Instruction Info:
22+
# CHECK-NEXT: [1]: #uOps
23+
# CHECK-NEXT: [2]: Latency
24+
# CHECK-NEXT: [3]: RThroughput
25+
# CHECK-NEXT: [4]: MayLoad
26+
# CHECK-NEXT: [5]: MayStore
27+
# CHECK-NEXT: [6]: HasSideEffects (U)
28+
29+
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
30+
# CHECK-NEXT: 1 1 0.25 movq $128, %rsp
31+
# CHECK-NEXT: 2 6 0.50 * popq %rax
32+
# CHECK-NEXT: 2 6 0.50 * popq %rcx
33+
# CHECK-NEXT: 2 6 0.50 * popq %rdx
34+
# CHECK-NEXT: 2 6 0.50 * popq %rbx
35+
# CHECK-NEXT: 2 6 0.50 * popq %r12
36+
37+
# CHECK: Resources:
38+
# CHECK-NEXT: [0] - SKLDivider
39+
# CHECK-NEXT: [1] - SKLFPDivider
40+
# CHECK-NEXT: [2] - SKLPort0
41+
# CHECK-NEXT: [3] - SKLPort1
42+
# CHECK-NEXT: [4] - SKLPort2
43+
# CHECK-NEXT: [5] - SKLPort3
44+
# CHECK-NEXT: [6] - SKLPort4
45+
# CHECK-NEXT: [7] - SKLPort5
46+
# CHECK-NEXT: [8] - SKLPort6
47+
# CHECK-NEXT: [9] - SKLPort7
48+
49+
# CHECK: Resource pressure per iteration:
50+
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
51+
# CHECK-NEXT: - - 1.50 1.50 2.50 2.50 - 1.50 1.50 -
52+
53+
# CHECK: Resource pressure by instruction:
54+
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
55+
# CHECK-NEXT: - - - - - - - 0.50 0.50 - movq $128, %rsp
56+
# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - popq %rax
57+
# CHECK-NEXT: - - - 0.50 0.50 0.50 - - 0.50 - popq %rcx
58+
# CHECK-NEXT: - - 0.50 - 0.50 0.50 - 0.50 - - popq %rdx
59+
# CHECK-NEXT: - - - 0.50 0.50 0.50 - - 0.50 - popq %rbx
60+
# CHECK-NEXT: - - 0.50 0.50 0.50 0.50 - - - - popq %r12
61+
62+
# CHECK: Timeline view:
63+
# CHECK-NEXT: 0123
64+
# CHECK-NEXT: Index 0123456789
65+
66+
# CHECK: [0,0] DeER . . . movq $128, %rsp
67+
# CHECK-NEXT: [0,1] D=eeeeeeER. . popq %rax
68+
# CHECK-NEXT: [0,2] D=eeeeeeER. . popq %rcx
69+
# CHECK-NEXT: [0,3] .D=eeeeeeER . popq %rdx
70+
# CHECK-NEXT: [0,4] .D=eeeeeeER . popq %rbx
71+
# CHECK-NEXT: [0,5] .D==eeeeeeER . popq %r12
72+
# CHECK-NEXT: [1,0] . DeE------R . movq $128, %rsp
73+
# CHECK-NEXT: [1,1] . D=eeeeeeER . popq %rax
74+
# CHECK-NEXT: [1,2] . D==eeeeeeER. popq %rcx
75+
# CHECK-NEXT: [1,3] . D=eeeeeeER. popq %rdx
76+
# CHECK-NEXT: [1,4] . D==eeeeeeER popq %rbx
77+
# CHECK-NEXT: [1,5] . D==eeeeeeER popq %r12
78+
79+
# CHECK: Average Wait times (based on the timeline view):
80+
# CHECK-NEXT: [0]: Executions
81+
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
82+
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
83+
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
84+
85+
# CHECK: [0] [1] [2] [3]
86+
# CHECK-NEXT: 0. 2 1.0 1.0 3.0 movq $128, %rsp
87+
# CHECK-NEXT: 1. 2 2.0 0.0 0.0 popq %rax
88+
# CHECK-NEXT: 2. 2 2.5 0.5 0.0 popq %rcx
89+
# CHECK-NEXT: 3. 2 2.0 1.0 0.0 popq %rdx
90+
# CHECK-NEXT: 4. 2 2.5 1.5 0.0 popq %rbx
91+
# CHECK-NEXT: 5. 2 3.0 2.0 0.0 popq %r12
92+
# CHECK-NEXT: 2 2.2 1.0 0.5 <total>
Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
2+
# RUN: llvm-mca -mtriple=x86_64-unknown-unknown -mcpu=skylake -timeline -iterations=2 < %s | FileCheck %s
3+
4+
movq $0x80, %rsp
5+
pushq %rax
6+
pushq %rcx
7+
pushq %rdx
8+
pushq %rbx
9+
pushq %r12
10+
11+
# CHECK: Iterations: 2
12+
# CHECK-NEXT: Instructions: 12
13+
# CHECK-NEXT: Total Cycles: 15
14+
# CHECK-NEXT: Total uOps: 32
15+
16+
# CHECK: Dispatch Width: 6
17+
# CHECK-NEXT: uOps Per Cycle: 2.13
18+
# CHECK-NEXT: IPC: 0.80
19+
# CHECK-NEXT: Block RThroughput: 5.0
20+
21+
# CHECK: Instruction Info:
22+
# CHECK-NEXT: [1]: #uOps
23+
# CHECK-NEXT: [2]: Latency
24+
# CHECK-NEXT: [3]: RThroughput
25+
# CHECK-NEXT: [4]: MayLoad
26+
# CHECK-NEXT: [5]: MayStore
27+
# CHECK-NEXT: [6]: HasSideEffects (U)
28+
29+
# CHECK: [1] [2] [3] [4] [5] [6] Instructions:
30+
# CHECK-NEXT: 1 1 0.25 movq $128, %rsp
31+
# CHECK-NEXT: 3 2 1.00 * pushq %rax
32+
# CHECK-NEXT: 3 2 1.00 * pushq %rcx
33+
# CHECK-NEXT: 3 2 1.00 * pushq %rdx
34+
# CHECK-NEXT: 3 2 1.00 * pushq %rbx
35+
# CHECK-NEXT: 3 2 1.00 * pushq %r12
36+
37+
# CHECK: Resources:
38+
# CHECK-NEXT: [0] - SKLDivider
39+
# CHECK-NEXT: [1] - SKLFPDivider
40+
# CHECK-NEXT: [2] - SKLPort0
41+
# CHECK-NEXT: [3] - SKLPort1
42+
# CHECK-NEXT: [4] - SKLPort2
43+
# CHECK-NEXT: [5] - SKLPort3
44+
# CHECK-NEXT: [6] - SKLPort4
45+
# CHECK-NEXT: [7] - SKLPort5
46+
# CHECK-NEXT: [8] - SKLPort6
47+
# CHECK-NEXT: [9] - SKLPort7
48+
49+
# CHECK: Resource pressure per iteration:
50+
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9]
51+
# CHECK-NEXT: - - 1.50 1.50 1.50 1.50 5.00 1.50 1.50 2.00
52+
53+
# CHECK: Resource pressure by instruction:
54+
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
55+
# CHECK-NEXT: - - - - - - - - 1.00 - movq $128, %rsp
56+
# CHECK-NEXT: - - 0.50 - 0.50 - 1.00 0.50 - 0.50 pushq %rax
57+
# CHECK-NEXT: - - - 0.50 - 0.50 1.00 - 0.50 0.50 pushq %rcx
58+
# CHECK-NEXT: - - 0.50 - 0.50 0.50 1.00 0.50 - - pushq %rdx
59+
# CHECK-NEXT: - - - 0.50 0.50 - 1.00 0.50 - 0.50 pushq %rbx
60+
# CHECK-NEXT: - - 0.50 0.50 - 0.50 1.00 - - 0.50 pushq %r12
61+
62+
# CHECK: Timeline view:
63+
# CHECK-NEXT: 01234
64+
# CHECK-NEXT: Index 0123456789
65+
66+
# CHECK: [0,0] DeER . . . movq $128, %rsp
67+
# CHECK-NEXT: [0,1] D=eeER . . pushq %rax
68+
# CHECK-NEXT: [0,2] .D=eeER . . pushq %rcx
69+
# CHECK-NEXT: [0,3] .D==eeER . . pushq %rdx
70+
# CHECK-NEXT: [0,4] . D==eeER . . pushq %rbx
71+
# CHECK-NEXT: [0,5] . D===eeER. . pushq %r12
72+
# CHECK-NEXT: [1,0] . DeE---R. . movq $128, %rsp
73+
# CHECK-NEXT: [1,1] . D===eeER . pushq %rax
74+
# CHECK-NEXT: [1,2] . D===eeER . pushq %rcx
75+
# CHECK-NEXT: [1,3] . D====eeER . pushq %rdx
76+
# CHECK-NEXT: [1,4] . D====eeER. pushq %rbx
77+
# CHECK-NEXT: [1,5] . D=====eeER pushq %r12
78+
79+
# CHECK: Average Wait times (based on the timeline view):
80+
# CHECK-NEXT: [0]: Executions
81+
# CHECK-NEXT: [1]: Average time spent waiting in a scheduler's queue
82+
# CHECK-NEXT: [2]: Average time spent waiting in a scheduler's queue while ready
83+
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
84+
85+
# CHECK: [0] [1] [2] [3]
86+
# CHECK-NEXT: 0. 2 1.0 1.0 1.5 movq $128, %rsp
87+
# CHECK-NEXT: 1. 2 3.0 0.5 0.0 pushq %rax
88+
# CHECK-NEXT: 2. 2 3.0 1.0 0.0 pushq %rcx
89+
# CHECK-NEXT: 3. 2 4.0 1.0 0.0 pushq %rdx
90+
# CHECK-NEXT: 4. 2 4.0 1.0 0.0 pushq %rbx
91+
# CHECK-NEXT: 5. 2 5.0 1.0 0.0 pushq %r12
92+
# CHECK-NEXT: 2 3.3 0.9 0.3 <total>

0 commit comments

Comments
 (0)