1515//===----------------------------------------------------------------------===//
1616
1717def Znver4Model : SchedMachineModel {
18- // AMD SOG Zen4, 2.9.6 Dispatch
18+ // AMD SOG Zen4, 2.9.8 Dispatch
1919 // The processor may dispatch up to 6 macro ops per cycle
2020 // into the execution engine.
2121 let IssueWidth = 6;
@@ -46,8 +46,9 @@ def Znver4Model : SchedMachineModel {
4646 int VecLoadLatency = 7;
4747 // Latency of a simple store operation.
4848 int StoreLatency = 1;
49- // FIXME:
50- let HighLatency = 25; // FIXME: any better choice?
49+ // Mean and median value for all instructions with latencies >6
50+ // Source: Zen4 Instruction Latencies spreadsheet (included with SOG)
51+ let HighLatency = 13;
5152 // AMD SOG Zen4, 2.8 Optimizing Branching
5253 // The branch misprediction penalty is in the range from 11 to 18 cycles,
5354 // <...>. The common case penalty is 13 cycles.
@@ -612,6 +613,7 @@ def Zn4WriteLEA : SchedWriteVariant<[
612613
613614def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
614615
616+ // values from uops.info
615617def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
616618 let Latency = 2; // FIXME: not from llvm-exegesis
617619 let ReleaseAtCycles = [4];
@@ -659,15 +661,15 @@ def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
659661
660662def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
661663 let Latency = 3; // FIXME: not from llvm-exegesis
662- let ReleaseAtCycles = [24 ];
663- let NumMicroOps = 19 ;
664+ let ReleaseAtCycles = [20 ];
665+ let NumMicroOps = 15 ;
664666}
665667def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
666668
667669def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
668- let Latency = 4 ; // FIXME: not from llvm-exegesis
669- let ReleaseAtCycles = [59 ];
670- let NumMicroOps = 28 ;
670+ let Latency = 2 ; // FIXME: not from llvm-exegesis
671+ let ReleaseAtCycles = [40 ];
672+ let NumMicroOps = 26 ;
671673}
672674def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
673675
@@ -681,7 +683,7 @@ def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16a
681683def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
682684 let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
683685 let ReleaseAtCycles = [1, 1, 2];
684- let NumMicroOps = 5 ;
686+ let NumMicroOps = 2 ;
685687}
686688def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
687689
@@ -693,19 +695,17 @@ def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]>
693695def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
694696
695697// Integer division.
696- // FIXME: uops for 8-bit division measures as 2. for others it's a guess.
697- // FIXME: latency for 8-bit division measures as 10. for others it's a guess.
698- defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
699- defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
700- defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
701- defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
702- defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
703- defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
704- defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
705- defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;
706-
707- defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
708- defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.
698+ defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 9, [9], 2>;
699+ defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 10, [10], 2>;
700+ defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 12, [12], 2>;
701+ defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 18, [18], 2>;
702+ defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 9, [9], 2>;
703+ defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 10, [10], 2>;
704+ defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 12, [12], 2>;
705+ defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 18, [18], 2>;
706+
707+ defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan forward.
708+ defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan reverse.
709709
710710defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.
711711
@@ -725,12 +725,12 @@ def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
725725}
726726def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;
727727
728- defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2 , [1], 2 >; // Trailing zero count.
728+ defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 1 , [1], 1 >; // Trailing zero count.
729729
730730def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
731- let Latency = 2 ;
732- let ReleaseAtCycles = [4 ];
733- let NumMicroOps = 2 ;
731+ let Latency = 1 ;
732+ let ReleaseAtCycles = [1 ];
733+ let NumMicroOps = 1 ;
734734}
735735def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;
736736
@@ -1109,15 +1109,31 @@ def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
11091109}
11101110def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;
11111111
1112- def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
1113- // TODO: All align instructions are expected to be of 4 cycle latency
1114- let Latency = 4 ;
1112+ // 128-bit VALIGN
1113+ def Zn4WriteXMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
1114+ let Latency = 2 ;
11151115 let ReleaseAtCycles = [1];
11161116 let NumMicroOps = 1;
11171117}
1118- def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
1119- VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)
1120- >;
1118+
1119+ // 256-bit VALIGN
1120+ def Zn4WriteYMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
1121+ let Latency = 3;
1122+ let ReleaseAtCycles = [1];
1123+ let NumMicroOps = 1;
1124+ }
1125+
1126+ // 512-bit VALIGN
1127+ def Zn4WriteZMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
1128+ let Latency = 4;
1129+ let ReleaseAtCycles = [2];
1130+ let NumMicroOps = 1;
1131+ }
1132+
1133+ def : InstRW<[Zn4WriteXMMVecALU2Slow], (instrs VALIGNDZ128rri, VALIGNQZ128rri)>; // Z128 suffix = 128-bit (XMM) form
1134+ def : InstRW<[Zn4WriteYMMVecALU2Slow], (instrs VALIGNDZ256rri, VALIGNQZ256rri)>; // Z256 suffix = 256-bit (YMM) form
1135+ def : InstRW<[Zn4WriteZMMVecALU2Slow], (instrs VALIGNDZrri, VALIGNQZrri)>; // bare Z suffix = 512-bit (ZMM) form
1136+
11211137defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
11221138
11231139def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
@@ -1326,9 +1342,9 @@ def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
13261342
13271343// Strings instructions.
13281344// Packed Compare Implicit Length Strings, Return Mask
1329- defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6 , [8], 3, /*LoadUOps=*/1>;
1345+ defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 7 , [8], 3, /*LoadUOps=*/1>;
13301346// Packed Compare Explicit Length Strings, Return Mask
1331- defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6 , [12], 7, /*LoadUOps=*/5>;
1347+ defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 7 , [12], 7, /*LoadUOps=*/5>;
13321348// Packed Compare Implicit Length Strings, Return Index
13331349defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
13341350// Packed Compare Explicit Length Strings, Return Index
@@ -1340,7 +1356,7 @@ defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn
13401356defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.
13411357
13421358// Carry-less multiplication instructions.
1343- defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4 ], 4>;
1359+ defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [3 ], 4>;
13441360
13451361// EMMS/FEMMS
13461362defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
@@ -1386,44 +1402,44 @@ def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
13861402def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rmi)>;
13871403
13881404def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
1389- let Latency = 7 ;
1405+ let Latency = 4 ;
13901406 let ReleaseAtCycles = [1];
1391- let NumMicroOps = 2 ;
1407+ let NumMicroOps = 1 ;
13921408}
13931409def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
13941410
13951411def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
13961412 let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMPSYrr.Latency);
1397- let ReleaseAtCycles = [1, 1, 2 ];
1398- let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1) ;
1413+ let ReleaseAtCycles = [1, 1, 1 ];
1414+ let NumMicroOps = 1 ;
13991415}
14001416def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
14011417
14021418def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
1403- let Latency = 6 ;
1419+ let Latency = 4 ;
14041420 let ReleaseAtCycles = [1];
1405- let NumMicroOps = 2 ;
1421+ let NumMicroOps = 1 ;
14061422}
14071423def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
14081424
14091425def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
14101426 let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMYri.Latency);
1411- let ReleaseAtCycles = [1, 1, 2 ];
1412- let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1) ;
1427+ let ReleaseAtCycles = [1, 1, 1 ];
1428+ let NumMicroOps = 1 ;
14131429}
14141430def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
14151431
14161432def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
1417- let Latency = 5 ;
1433+ let Latency = 4 ;
14181434 let ReleaseAtCycles = [1];
1419- let NumMicroOps = 2 ;
1435+ let NumMicroOps = 1 ;
14201436}
14211437def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;
14221438
14231439def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
14241440 let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMDYrr.Latency);
1425- let ReleaseAtCycles = [1, 1, 2 ];
1426- let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0) ;
1441+ let ReleaseAtCycles = [1, 1, 1 ];
1442+ let NumMicroOps = 1 ;
14271443}
14281444def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;
14291445
0 commit comments