99// This file defines the machine model for Znver4 to support instruction
1010// scheduling and other instruction cost heuristics.
1111// Based on:
12- // * AMD Software Optimization Guide for AMD Family 19h Processors.
13- // https://www.amd.com/system/files/TechDocs/56665.zip
12+ // * AMD Software Optimization Guide for the AMD Family 19h (Zen4)
13+ // Microarchitecture
14+ // https://www.amd.com/system/files/TechDocs/57647.zip
1415//===----------------------------------------------------------------------===//
1516
1617def Znver4Model : SchedMachineModel {
17- // AMD SOG 19h , 2.9.6 Dispatch
18+ // AMD SOG Zen4 , 2.9.6 Dispatch
1819 // The processor may dispatch up to 6 macro ops per cycle
1920 // into the execution engine.
2021 let IssueWidth = 6;
21- // AMD SOG 19h , 2.10.3
22+ // AMD SOG Zen4 , 2.10.3
2223 // The retire control unit (RCU) tracks the completion status of all
2324 // outstanding operations (integer, load/store, and floating-point) and is
2425 // the final arbiter for exception processing and recovery.
2526 // The unit can receive up to 6 macro ops dispatched per cycle and track up
2627 // to 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode.
2728 let MicroOpBufferSize = 320;
28- // AMD SOG 19h , 2.9.1 Op Cache
29+ // AMD SOG Zen4 , 2.9.1 Op Cache
2930 // The op cache is organized as an associative cache with 64 sets and 8 ways.
3031 // At each set-way intersection is an entry containing up to 8 macro ops.
3132 // The maximum capacity of the op cache is 6.75K ops.
3233 // Assuming a maximum dispatch of 9 ops/cy and a mispredict cost of 12cy from
3334 // the op-cache, we limit the loop buffer to 9*12 = 108 to avoid loop
3435 // unrolling leading to excessive filling of the op-cache from frontend.
3536 let LoopMicroOpBufferSize = 108;
36- // AMD SOG 19h , 2.6.2 L1 Data Cache
37+ // AMD SOG Zen4 , 2.6.2 L1 Data Cache
3738 // The L1 data cache has a 4- or 5- cycle integer load-to-use latency.
38- // AMD SOG 19h , 2.12 L1 Data Cache
39+ // AMD SOG Zen4 , 2.12 L1 Data Cache
3940 // The AGU and LS pipelines are optimized for simple address generation modes.
4041 // <...> and can achieve 4-cycle load-to-use integer load latency.
4142 let LoadLatency = 4;
42- // AMD SOG 19h , 2.12 L1 Data Cache
43+ // AMD SOG Zen4 , 2.12 L1 Data Cache
4344 // The AGU and LS pipelines are optimized for simple address generation modes.
4445 // <...> and can achieve <...> 7-cycle load-to-use FP load latency.
4546 int VecLoadLatency = 7;
4647 // Latency of a simple store operation.
4748 int StoreLatency = 1;
4849 // FIXME:
4950 let HighLatency = 25; // FIXME: any better choice?
50- // AMD SOG 19h , 2.8 Optimizing Branching
51+ // AMD SOG Zen4 , 2.8 Optimizing Branching
5152 // The branch misprediction penalty is in the range from 11 to 18 cycles,
5253 // <...>. The common case penalty is 13 cycles.
5354 let MispredictPenalty = 13;
@@ -64,7 +65,7 @@ let SchedModel = Znver4Model in {
6465// RCU
6566//===----------------------------------------------------------------------===//
6667
67- // AMD SOG 19h , 2.10.3 Retire Control Unit
68+ // AMD SOG Zen4 , 2.10.3 Retire Control Unit
6869// The unit can receive up to 6 macro ops dispatched per cycle and track up to
6970// 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode. <...>
7071// The retire unit handles in-order commit of up to nine macro ops per cycle.
@@ -74,27 +75,27 @@ def Zn4RCU : RetireControlUnit<Znver4Model.MicroOpBufferSize, 9>;
7475// Integer Execution Unit
7576//
7677
77- // AMD SOG 19h , 2.4 Superscalar Organization
78+ // AMD SOG Zen4 , 2.4 Superscalar Organization
7879// The processor uses four decoupled independent integer scheduler queues,
7980// each one servicing one ALU pipeline and one or two other pipelines
8081
8182//
8283// Execution pipes
8384//===----------------------------------------------------------------------===//
8485
85- // AMD SOG 19h , 2.10.2 Execution Units
86+ // AMD SOG Zen4 , 2.10.2 Execution Units
8687// The processor contains 4 general purpose integer execution pipes.
8788// Each pipe has an ALU capable of general purpose integer operations.
// One single-unit processor resource per general-purpose integer ALU pipe
// (four in total, ALU0 through ALU3).
8889def Zn4ALU0 : ProcResource<1>;
8990def Zn4ALU1 : ProcResource<1>;
9091def Zn4ALU2 : ProcResource<1>;
9192def Zn4ALU3 : ProcResource<1>;
9293
93- // AMD SOG 19h , 2.10.2 Execution Units
94+ // AMD SOG Zen4 , 2.10.2 Execution Units
9495// There is also a separate branch execution unit.
// Single-unit resource for the separate branch execution unit. The other
// branch-capable pipe is not a resource of its own: Zn4BRU0 is declared
// further down as an alias of Zn4ALU0.
9596def Zn4BRU1 : ProcResource<1>;
9697
97- // AMD SOG 19h , 2.10.2 Execution Units
98+ // AMD SOG Zen4 , 2.10.2 Execution Units
9899// There are three Address Generation Units (AGUs) for all load and store
99100// address generation. There are also 3 store data movement units
100101// associated with the same schedulers as the AGUs.
@@ -106,11 +107,11 @@ def Zn4AGU2 : ProcResource<1>;
106107// Execution Units
107108//===----------------------------------------------------------------------===//
108109
109- // AMD SOG 19h , 2.10.2 Execution Units
110+ // AMD SOG Zen4 , 2.10.2 Execution Units
110111// ALU0 additionally has divide <...> execution capability.
// Alias rather than a new resource: anything scheduled on Zn4Divider
// occupies Zn4ALU0 itself.
111112defvar Zn4Divider = Zn4ALU0;
112113
113- // AMD SOG 19h , 2.10.2 Execution Units
114+ // AMD SOG Zen4 , 2.10.2 Execution Units
114115// ALU0 additionally has <...> branch execution capability.
// Alias rather than a new resource: branches scheduled on Zn4BRU0 occupy
// Zn4ALU0, the same pipe that Zn4Divider names.
115116defvar Zn4BRU0 = Zn4ALU0;
116117
@@ -143,14 +144,14 @@ def Zn4ALU12 : ProcResGroup<[Zn4ALU1, Zn4ALU2]>;
143144// Scheduling
144145//===----------------------------------------------------------------------===//
145146
146- // AMD SOG 19h , 2.10.3 Retire Control Unit
147+ // AMD SOG Zen4 , 2.10.3 Retire Control Unit
147148// The integer physical register file (PRF) consists of 224 registers.
// Integer register file with 224 physical registers, covering the GR64 and
// CCR register classes.
// NOTE(review): the two positional lists after the classes ([1, 1] and
// [1, 0]) are presumably the per-class register cost and the per-class
// "move elimination allowed" flag (i.e. GR64 yes, CCR no) - confirm against
// the RegisterFile class definition.
148149def Zn4IntegerPRF : RegisterFile<224, [GR64, CCR], [1, 1], [1, 0],
149150 6, // Max moves that can be eliminated per cycle.
150151 0>; // Restrict move elimination to zero regs.
151152
152153// anandtech, The integer scheduler has a 4*24 entry macro op capacity.
153- // AMD SOG 19h , 2.10.1 Schedulers
154+ // AMD SOG Zen4 , 2.10.1 Schedulers
154155// The schedulers can receive up to six macro ops per cycle, with a limit of
155156// two per scheduler. Each scheduler can issue one micro op per cycle into
156157// each of its associated pipelines
@@ -167,15 +168,15 @@ def Zn4Int : ProcResGroup<[Zn4ALU0, Zn4AGU0, Zn4BRU0, // scheduler 0
167168// Floating-Point Unit
168169//
169170
170- // AMD SOG 19h , 2.4 Superscalar Organization
171+ // AMD SOG Zen4 , 2.4 Superscalar Organization
171172// The processor uses <...> two decoupled independent floating point schedulers
172173// each servicing two FP pipelines and one store or FP-to-integer pipeline.
173174
174175//
175176// Execution pipes
176177//===----------------------------------------------------------------------===//
177178
178- // AMD SOG 19h , 2.10.1 Schedulers
179+ // AMD SOG Zen4 , 2.10.1 Schedulers
179180// <...>, and six FPU pipes.
180181// Agner, 22.10 Floating point execution pipes
181182// There are six floating point/vector execution pipes,
@@ -188,7 +189,7 @@ def Zn4FP45 : ProcResource<2>;
188189//
189190// Execution Units
190191//===----------------------------------------------------------------------===//
191- // AMD SOG 19h , 2.11.1 Floating Point Execution Resources
192+ // AMD SOG Zen4 , 2.11.1 Floating Point Execution Resources
192193
193194// (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ)
// First multiply/FMA unit: an alias of pipe Zn4FP0, not a separate resource.
194195defvar Zn4FPFMul0 = Zn4FP0;
@@ -203,7 +204,7 @@ defvar Zn4FPFCvt0 = Zn4FP2;
// Second FCvt unit (presumably FP convert - confirm): an alias of pipe
// Zn4FP3, alongside Zn4FPFCvt0 which aliases Zn4FP2.
203204defvar Zn4FPFCvt1 = Zn4FP3;
204205
205206// All Divide and Square Root except Reciprocal Approximation
206- // AMD SOG 19h , 2.11.1 Floating Point Execution Resources
207+ // AMD SOG Zen4 , 2.11.1 Floating Point Execution Resources
207208// FDIV unit can support 2 simultaneous operations in flight
208209// even though it occupies a single pipe.
209210// FIXME: BufferSize=2 ?
@@ -252,7 +253,7 @@ defvar Zn4FPCLM1 = Zn4FP1;
252253// Execution pipeline grouping
253254//===----------------------------------------------------------------------===//
254255
255- // AMD SOG 19h , 2.11 Floating-Point Unit
256+ // AMD SOG Zen4 , 2.11 Floating-Point Unit
256257// Stores and floating point to general purpose register transfer
257258// have 2 dedicated pipelines (pipe 5 and 6).
// Resource group spanning the four pipes Zn4FP0 through Zn4FP3.
258259def Zn4FPU0123 : ProcResGroup<[Zn4FP0, Zn4FP1, Zn4FP2, Zn4FP3]>;
@@ -281,12 +282,12 @@ def Zn4FPFMisc23 : ProcResGroup<[Zn4FPFMisc2, Zn4FPFMisc3]>;
// Resource group covering the FMisc units 1, 2 and 3 (FMisc0 is excluded).
281282def Zn4FPFMisc123 : ProcResGroup<[Zn4FPFMisc1,Zn4FPFMisc2, Zn4FPFMisc3]>;
282283
283284// Loads, Stores and Move to General Register (EX) Operations
284- // AMD SOG 19h , 2.11 Floating-Point Unit
285+ // AMD SOG Zen4 , 2.11 Floating-Point Unit
285286// Stores and floating point to general purpose register transfer
286287// have 2 dedicated pipelines (pipe 5 and 6).
// Alias of the two-unit Zn4FP45 resource.
// NOTE(review): the quoted text above says "pipe 5 and 6" while the resource
// is named Zn4FP45 - presumably 1-based versus 0-based pipe numbering;
// confirm against the optimization guide.
287288defvar Zn4FPLd01 = Zn4FP45;
288289
289- // AMD SOG 19h , 2.11 Floating-Point Unit
290+ // AMD SOG Zen4 , 2.11 Floating-Point Unit
290291// Note that FP stores are supported on two pipelines,
291292// but throughput is limited to one per cycle.
292293let Super = Zn4FP45 in
@@ -334,9 +335,9 @@ def Zn4FpPRF : RegisterFile<192, [VR64, VR128, VR256, VR512], [1, 1, 1, 1], [0,
334335 6, // Max moves that can be eliminated per cycle.
335336 0>; // Restrict move elimination to zero regs.
336337
337- // AMD SOG 19h , 2.11 Floating-Point Unit
338+ // AMD SOG Zen4 , 2.11 Floating-Point Unit
338339// The floating-point scheduler has a 2*32 entry macro op capacity.
339- // AMD SOG 19h , 2.11 Floating-Point Unit
340+ // AMD SOG Zen4 , 2.11 Floating-Point Unit
340341// <...> the scheduler can issue 1 micro op per cycle for each pipe.
341342// FIXME: those are two separate schedulers, not a single big one.
342343def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/ // scheduler 0
@@ -345,7 +346,7 @@ def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/ // scheduler 0
345346 let BufferSize = !mul(2, 32);
346347}
347348
348- // AMD SOG 19h , 2.11 Floating-Point Unit
349+ // AMD SOG Zen4 , 2.11 Floating-Point Unit
349350// Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ)
350351// even if floating-point scheduler is full.
351352// FIXME: how to model this properly?
@@ -355,27 +356,27 @@ def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/ // scheduler 0
355356// Load-Store Unit
356357//
357358
358- // AMD SOG 19h , 2.12 Load-Store Unit
359+ // AMD SOG Zen4 , 2.12 Load-Store Unit
359360// The LS unit contains three largely independent pipe-lines
360361// enabling the execution of three 256-bit memory operations per cycle.
// Three-unit resource for the load-store pipelines. Zn4Load and Zn4Store
// below both name it as their Super resource.
361362def Zn4LSU : ProcResource<3>;
362363
363- // AMD SOG 19h , 2.12 Load-Store Unit
364+ // AMD SOG Zen4 , 2.12 Load-Store Unit
364365// All three memory operations can be loads.
// Load resource: three units, declared with Zn4LSU as its Super resource,
// and a buffer of 72 entries matching the out-of-order load capacity quoted
// inside the body.
365366let Super = Zn4LSU in
366367def Zn4Load : ProcResource<3> {
367- // AMD SOG 19h , 2.12 Load-Store Unit
368+ // AMD SOG Zen4 , 2.12 Load-Store Unit
368369 // The LS unit can process up to 72 out-of-order loads.
369370 let BufferSize = 72;
370371}
371372
// Declares the load queue on top of the Zn4Load resource (BufferSize 72).
372373def Zn4LoadQueue : LoadQueue<Zn4Load>;
373374
374- // AMD SOG 19h , 2.12 Load-Store Unit
375+ // AMD SOG Zen4 , 2.12 Load-Store Unit
375376// A maximum of two of the memory operations can be stores.
// Store resource: two units (at most two of the three memory operations can
// be stores), declared with Zn4LSU as its Super resource, and a buffer of 64
// entries matching the store queue size quoted inside the body.
376377let Super = Zn4LSU in
377378def Zn4Store : ProcResource<2> {
378- // AMD SOG 19h , 2.12 Load-Store Unit
379+ // AMD SOG Zen4 , 2.12 Load-Store Unit
379380 // The LS unit utilizes a 64-entry store queue (STQ).
380381 let BufferSize = 64;
381382}
@@ -491,7 +492,7 @@ def : ReadAdvance<ReadAfterVecLd, Znver4Model.VecLoadLatency>;
// The XMM- and YMM-width "read after vector load" operands are advanced by
// Znver4Model.VecLoadLatency, the same value used for ReadAfterVecLd.
491492def : ReadAdvance<ReadAfterVecXLd, Znver4Model.VecLoadLatency>;
492493def : ReadAdvance<ReadAfterVecYLd, Znver4Model.VecLoadLatency>;
493494
494- // AMD SOG 19h , 2.11 Floating-Point Unit
495+ // AMD SOG Zen4 , 2.11 Floating-Point Unit
495496// There is 1 cycle of added latency for a result to cross
496497// from F to I or I to F domain.
// The advance is negative: the one added cycle for crossing between the
// integer and FP domains is expressed as a read advance of -1 on ReadInt2Fpu.
497498def : ReadAdvance<ReadInt2Fpu, -1>;
0 commit comments