Skip to content

Commit c790d9c

Browse files
committed
zen4 fix
1 parent 70906f0 commit c790d9c

File tree

1 file changed

+40
-40
lines changed

1 file changed

+40
-40
lines changed

llvm/lib/Target/X86/X86ScheduleZnver4.td

Lines changed: 40 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -9,45 +9,45 @@
99
// This file defines the machine model for Znver4 to support instruction
1010
// scheduling and other instruction cost heuristics.
1111
// Based on:
12-
// * AMD Software Optimization Guide for AMD Family 19h Processors.
13-
// https://www.amd.com/system/files/TechDocs/56665.zip
12+
// * AMD Software Optimization Guide for the AMD Zen4 Microarchitecture.
13+
// https://www.amd.com/system/files/TechDocs/57647.zip
1414
//===----------------------------------------------------------------------===//
1515

1616
def Znver4Model : SchedMachineModel {
17-
// AMD SOG 19h, 2.9.6 Dispatch
17+
// AMD SOG Zen4, 2.9.6 Dispatch
1818
// The processor may dispatch up to 6 macro ops per cycle
1919
// into the execution engine.
2020
let IssueWidth = 6;
21-
// AMD SOG 19h, 2.10.3
21+
// AMD SOG Zen4, 2.10.3
2222
// The retire control unit (RCU) tracks the completion status of all
2323
// outstanding operations (integer, load/store, and floating-point) and is
2424
// the final arbiter for exception processing and recovery.
2525
// The unit can receive up to 6 macro ops dispatched per cycle and track up
2626
// to 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode.
2727
let MicroOpBufferSize = 320;
28-
// AMD SOG 19h, 2.9.1 Op Cache
28+
// AMD SOG Zen4, 2.9.1 Op Cache
2929
// The op cache is organized as an associative cache with 64 sets and 8 ways.
3030
// At each set-way intersection is an entry containing up to 8 macro ops.
3131
// The maximum capacity of the op cache is 6.75K ops.
3232
// Assuming a maximum dispatch of 9 ops/cy and a mispredict cost of 12cy from
3333
// the op-cache, we limit the loop buffer to 9*12 = 108 to avoid loop
3434
// unrolling leading to excessive filling of the op-cache from frontend.
3535
let LoopMicroOpBufferSize = 108;
36-
// AMD SOG 19h, 2.6.2 L1 Data Cache
36+
// AMD SOG Zen4, 2.6.2 L1 Data Cache
3737
// The L1 data cache has a 4- or 5-cycle integer load-to-use latency.
38-
// AMD SOG 19h, 2.12 L1 Data Cache
38+
// AMD SOG Zen4, 2.12 Load-Store Unit
3939
// The AGU and LS pipelines are optimized for simple address generation modes.
4040
// <...> and can achieve 4-cycle load-to-use integer load latency.
4141
let LoadLatency = 4;
42-
// AMD SOG 19h, 2.12 L1 Data Cache
42+
// AMD SOG Zen4, 2.12 Load-Store Unit
4343
// The AGU and LS pipelines are optimized for simple address generation modes.
4444
// <...> and can achieve <...> 7-cycle load-to-use FP load latency.
4545
int VecLoadLatency = 7;
4646
// Latency of a simple store operation.
4747
int StoreLatency = 1;
4848
// FIXME:
4949
let HighLatency = 25; // FIXME: any better choice?
50-
// AMD SOG 19h, 2.8 Optimizing Branching
50+
// AMD SOG Zen4, 2.8 Optimizing Branching
5151
// The branch misprediction penalty is in the range from 11 to 18 cycles,
5252
// <...>. The common case penalty is 13 cycles.
5353
let MispredictPenalty = 13;
@@ -64,37 +64,37 @@ let SchedModel = Znver4Model in {
6464
// RCU
6565
//===----------------------------------------------------------------------===//
6666

67-
// AMD SOG 19h, 2.10.3 Retire Control Unit
67+
// AMD SOG Zen4, 2.10.3 Retire Control Unit
6868
// The unit can receive up to 6 macro ops dispatched per cycle and track up to
69-
// 320 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode. <...>
70-
// The retire unit handles in-order commit of up to nine macro ops per cycle.
71-
def Zn4RCU : RetireControlUnit<Znver4Model.MicroOpBufferSize, 9>;
69+
// 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode. <...>
70+
// The retire unit handles in-order commit of up to eight macro ops per cycle.
71+
def Zn4RCU : RetireControlUnit<Znver4Model.MicroOpBufferSize, 8>;
7272

7373
//===----------------------------------------------------------------------===//
7474
// Integer Execution Unit
7575
//
7676

77-
// AMD SOG 19h, 2.4 Superscalar Organization
77+
// AMD SOG Zen4, 2.4 Superscalar Organization
7878
// The processor uses four decoupled independent integer scheduler queues,
7979
// each one servicing one ALU pipeline and one or two other pipelines
8080

8181
//
8282
// Execution pipes
8383
//===----------------------------------------------------------------------===//
8484

85-
// AMD SOG 19h, 2.10.2 Execution Units
85+
// AMD SOG Zen4, 2.10.2 Execution Units
8686
// The processor contains 4 general purpose integer execution pipes.
8787
// Each pipe has an ALU capable of general purpose integer operations.
8888
def Zn4ALU0 : ProcResource<1>;
8989
def Zn4ALU1 : ProcResource<1>;
9090
def Zn4ALU2 : ProcResource<1>;
9191
def Zn4ALU3 : ProcResource<1>;
9292

93-
// AMD SOG 19h, 2.10.2 Execution Units
93+
// AMD SOG Zen4, 2.10.2 Execution Units
9494
// There is also a separate branch execution unit.
9595
def Zn4BRU1 : ProcResource<1>;
9696

97-
// AMD SOG 19h, 2.10.2 Execution Units
97+
// AMD SOG Zen4, 2.10.2 Execution Units
9898
// There are three Address Generation Units (AGUs) for all load and store
9999
// address generation. There are also 3 store data movement units
100100
// associated with the same schedulers as the AGUs.
@@ -106,11 +106,11 @@ def Zn4AGU2 : ProcResource<1>;
106106
// Execution Units
107107
//===----------------------------------------------------------------------===//
108108

109-
// AMD SOG 19h, 2.10.2 Execution Units
109+
// AMD SOG Zen4, 2.10.2 Execution Units
110110
// ALU0 additionally has divide <...> execution capability.
111111
defvar Zn4Divider = Zn4ALU0;
112112

113-
// AMD SOG 19h, 2.10.2 Execution Units
113+
// AMD SOG Zen4, 2.10.2 Execution Units
114114
// ALU0 additionally has <...> branch execution capability.
115115
defvar Zn4BRU0 = Zn4ALU0;
116116

@@ -143,14 +143,14 @@ def Zn4ALU12 : ProcResGroup<[Zn4ALU1, Zn4ALU2]>;
143143
// Scheduling
144144
//===----------------------------------------------------------------------===//
145145

146-
// AMD SOG 19h, 2.10.3 Retire Control Unit
146+
// AMD SOG Zen4, 2.10.3 Retire Control Unit
147147
// The integer physical register file (PRF) consists of 224 registers.
148148
def Zn4IntegerPRF : RegisterFile<224, [GR64, CCR], [1, 1], [1, 0],
149149
6, // Max moves that can be eliminated per cycle.
150150
0>; // Restrict move elimination to zero regs.
151151

152152
// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
153-
// AMD SOG 19h, 2.10.1 Schedulers
153+
// AMD SOG Zen4, 2.10.1 Schedulers
154154
// The schedulers can receive up to six macro ops per cycle, with a limit of
155155
// two per scheduler. Each scheduler can issue one micro op per cycle into
156156
// each of its associated pipelines
@@ -167,15 +167,15 @@ def Zn4Int : ProcResGroup<[Zn4ALU0, Zn4AGU0, Zn4BRU0, // scheduler 0
167167
// Floating-Point Unit
168168
//
169169

170-
// AMD SOG 19h, 2.4 Superscalar Organization
170+
// AMD SOG Zen4, 2.4 Superscalar Organization
171171
// The processor uses <...> two decoupled independent floating point schedulers
172172
// each servicing two FP pipelines and one store or FP-to-integer pipeline.
173173

174174
//
175175
// Execution pipes
176176
//===----------------------------------------------------------------------===//
177177

178-
// AMD SOG 19h, 2.10.1 Schedulers
178+
// AMD SOG Zen4, 2.4 Superscalar Organization
179179
// <...>, and six FPU pipes.
180180
// Agner, 22.10 Floating point execution pipes
181181
// There are six floating point/vector execution pipes,
@@ -188,7 +188,7 @@ def Zn4FP45 : ProcResource<2>;
188188
//
189189
// Execution Units
190190
//===----------------------------------------------------------------------===//
191-
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
191+
// AMD SOG Zen4, 2.11.1 Floating Point Execution Resources
192192

193193
// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
194194
defvar Zn4FPFMul0 = Zn4FP0;
@@ -203,7 +203,7 @@ defvar Zn4FPFCvt0 = Zn4FP2;
203203
defvar Zn4FPFCvt1 = Zn4FP3;
204204

205205
// All Divide and Square Root except Reciprocal Approximation
206-
// AMD SOG 19h, 2.11.1 Floating Point Execution Resources
206+
// AMD SOG Zen4, 2.11.1 Floating Point Execution Resources
207207
// FDIV unit can support 2 simultaneous operations in flight
208208
// even though it occupies a single pipe.
209209
// FIXME: BufferSize=2 ?
@@ -252,7 +252,7 @@ defvar Zn4FPCLM1 = Zn4FP1;
252252
// Execution pipeline grouping
253253
//===----------------------------------------------------------------------===//
254254

255-
// AMD SOG 19h, 2.11 Floating-Point Unit
255+
// AMD SOG Zen4, 2.11 Floating-Point Unit
256256
// Stores and floating point to general purpose register transfer
257257
// have 2 dedicated pipelines (pipe 5 and 6).
258258
def Zn4FPU0123 : ProcResGroup<[Zn4FP0, Zn4FP1, Zn4FP2, Zn4FP3]>;
@@ -281,12 +281,12 @@ def Zn4FPFMisc23 : ProcResGroup<[Zn4FPFMisc2, Zn4FPFMisc3]>;
281281
def Zn4FPFMisc123 : ProcResGroup<[Zn4FPFMisc1,Zn4FPFMisc2, Zn4FPFMisc3]>;
282282

283283
// Loads, Stores and Move to General Register (EX) Operations
284-
// AMD SOG 19h, 2.11 Floating-Point Unit
284+
// AMD SOG Zen4, 2.11 Floating-Point Unit
285285
// Stores and floating point to general purpose register transfer
286286
// have 2 dedicated pipelines (pipe 5 and 6).
287287
defvar Zn4FPLd01 = Zn4FP45;
288288

289-
// AMD SOG 19h, 2.11 Floating-Point Unit
289+
// AMD SOG Zen4, 2.11 Floating-Point Unit
290290
// Note that FP stores are supported on two pipelines,
291291
// but throughput is limited to one per cycle.
292292
let Super = Zn4FP45 in
@@ -334,9 +334,9 @@ def Zn4FpPRF : RegisterFile<192, [VR64, VR128, VR256, VR512], [1, 1, 1, 1], [0,
334334
6, // Max moves that can be eliminated per cycle.
335335
0>; // Restrict move elimination to zero regs.
336336

337-
// AMD SOG 19h, 2.11 Floating-Point Unit
337+
// AMD SOG Zen4, 2.11 Floating-Point Unit
338338
// The floating-point scheduler has a 2*32 entry macro op capacity.
339-
// AMD SOG 19h, 2.11 Floating-Point Unit
339+
// AMD SOG Zen4, 2.11 Floating-Point Unit
340340
// <...> the scheduler can issue 1 micro op per cycle for each pipe.
341341
// FIXME: those are two separate schedulers, not a single big one.
342342
def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/ // scheduler 0
@@ -345,7 +345,7 @@ def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/ // scheduler 0
345345
let BufferSize = !mul(2, 32);
346346
}
347347

348-
// AMD SOG 19h, 2.11 Floating-Point Unit
348+
// AMD SOG Zen4, 2.11 Floating-Point Unit
349349
// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
350350
// even if the floating-point scheduler is full.
351351
// FIXME: how to model this properly?
@@ -355,27 +355,27 @@ def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/ // scheduler 0
355355
// Load-Store Unit
356356
//
357357

358-
// AMD SOG 19h, 2.12 Load-Store Unit
358+
// AMD SOG Zen4, 2.12 Load-Store Unit
359359
// The LS unit contains three largely independent pipelines
360-
// enabling the execution of three 256-bit memory operations per cycle.
360+
// enabling the execution of three memory operations per cycle.
361361
def Zn4LSU : ProcResource<3>;
362362

363-
// AMD SOG 19h, 2.12 Load-Store Unit
363+
// AMD SOG Zen4, 2.12 Load-Store Unit
364364
// All three memory operations can be loads.
365365
let Super = Zn4LSU in
366366
def Zn4Load : ProcResource<3> {
367-
// AMD SOG 19h, 2.12 Load-Store Unit
368-
// The LS unit can process up to 72 out-of-order loads.
369-
let BufferSize = 72;
367+
// AMD SOG Zen4, 2.12 Load-Store Unit
368+
// The LS can track up to 48 uncompleted loads and up to 88 completed loads.
369+
let BufferSize = 88;
370370
}
371371

372372
def Zn4LoadQueue : LoadQueue<Zn4Load>;
373373

374-
// AMD SOG 19h, 2.12 Load-Store Unit
374+
// AMD SOG Zen4, 2.12 Load-Store Unit
375375
// A maximum of two of the memory operations can be stores.
376376
let Super = Zn4LSU in
377377
def Zn4Store : ProcResource<2> {
378-
// AMD SOG 19h, 2.12 Load-Store Unit
378+
// AMD SOG Zen4, 2.12 Load-Store Unit
379379
// The LS unit utilizes a 64-entry store queue (STQ).
380380
let BufferSize = 64;
381381
}
@@ -491,7 +491,7 @@ def : ReadAdvance<ReadAfterVecLd, Znver4Model.VecLoadLatency>;
491491
def : ReadAdvance<ReadAfterVecXLd, Znver4Model.VecLoadLatency>;
492492
def : ReadAdvance<ReadAfterVecYLd, Znver4Model.VecLoadLatency>;
493493

494-
// AMD SOG 19h, 2.11 Floating-Point Unit
494+
// AMD SOG Zen4, 2.11 Floating-Point Unit
495495
// There is 1 cycle of added latency for a result to cross
496496
// from F to I or I to F domain.
497497
def : ReadAdvance<ReadInt2Fpu, -1>;

0 commit comments

Comments
 (0)