Skip to content

Commit 218a2a7

Browse files
NexusXe authored and
aokblast committed
[X86] Fix some values for Znver4 model (llvm#161405)
This PR fixes a handful of latency and uop values in the Znver4 model that had been copied unchanged from Znver3. Values that still matched the Zen3 measurements on uops.info were updated to the Zen4 measurements. Includes: BSF/BSR, DIV, TZCNT, CLMUL, PCMPISTRM, VALIGN, VPERM.
1 parent c5415ee commit 218a2a7

13 files changed

+188
-172
lines changed

llvm/lib/Target/X86/X86ScheduleZnver4.td

Lines changed: 63 additions & 47 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,7 @@
1515
//===----------------------------------------------------------------------===//
1616

1717
def Znver4Model : SchedMachineModel {
18-
// AMD SOG Zen4, 2.9.6 Dispatch
18+
// AMD SOG Zen4, 2.9.8 Dispatch
1919
// The processor may dispatch up to 6 macro ops per cycle
2020
// into the execution engine.
2121
let IssueWidth = 6;
@@ -46,8 +46,9 @@ def Znver4Model : SchedMachineModel {
4646
int VecLoadLatency = 7;
4747
// Latency of a simple store operation.
4848
int StoreLatency = 1;
49-
// FIXME:
50-
let HighLatency = 25; // FIXME: any better choice?
49+
// Mean and median value for all instructions with latencies >6
50+
// Source: Zen4 Instruction Latencies spreadsheet (included with SOG)
51+
let HighLatency = 13;
5152
// AMD SOG Zen4, 2.8 Optimizing Branching
5253
// The branch misprediction penalty is in the range from 11 to 18 cycles,
5354
// <...>. The common case penalty is 13 cycles.
@@ -612,6 +613,7 @@ def Zn4WriteLEA : SchedWriteVariant<[
612613

613614
def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
614615

616+
// values from uops.info
615617
def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
616618
let Latency = 2; // FIXME: not from llvm-exegesis
617619
let ReleaseAtCycles = [4];
@@ -659,15 +661,15 @@ def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;
659661

660662
def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
661663
let Latency = 3; // FIXME: not from llvm-exegesis
662-
let ReleaseAtCycles = [24];
663-
let NumMicroOps = 19;
664+
let ReleaseAtCycles = [20];
665+
let NumMicroOps = 15;
664666
}
665667
def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;
666668

667669
def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
668-
let Latency = 4; // FIXME: not from llvm-exegesis
669-
let ReleaseAtCycles = [59];
670-
let NumMicroOps = 28;
670+
let Latency = 2; // FIXME: not from llvm-exegesis
671+
let ReleaseAtCycles = [40];
672+
let NumMicroOps = 26;
671673
}
672674
def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;
673675

@@ -681,7 +683,7 @@ def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16a
681683
def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
682684
let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
683685
let ReleaseAtCycles = [1, 1, 2];
684-
let NumMicroOps = 5;
686+
let NumMicroOps = 2;
685687
}
686688
def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;
687689

@@ -693,19 +695,17 @@ def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]>
693695
def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;
694696

695697
// Integer division.
696-
// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
697-
// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
698-
defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
699-
defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
700-
defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
701-
defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
702-
defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
703-
defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
704-
defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
705-
defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;
706-
707-
defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
708-
defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.
698+
defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 9, [9], 2>;
699+
defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 10, [10], 2>;
700+
defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 12, [12], 2>;
701+
defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 18, [18], 2>;
702+
defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 9, [9], 2>;
703+
defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 10, [10], 2>;
704+
defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 12, [12], 2>;
705+
defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 18, [18], 2>;
706+
707+
defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan forward.
708+
defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan reverse.
709709

710710
defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.
711711

@@ -725,12 +725,12 @@ def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
725725
}
726726
def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;
727727

728-
defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.
728+
defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 1, [1], 1>; // Trailing zero count.
729729

730730
def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
731-
let Latency = 2;
732-
let ReleaseAtCycles = [4];
733-
let NumMicroOps = 2;
731+
let Latency = 1;
732+
let ReleaseAtCycles = [1];
733+
let NumMicroOps = 1;
734734
}
735735
def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;
736736

@@ -1109,15 +1109,31 @@ def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
11091109
}
11101110
def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;
11111111

1112-
def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
1113-
// TODO: All align instructions are expected to be of 4 cycle latency
1114-
let Latency = 4;
1112+
// 128-bit VALIGN
1113+
def Zn4WriteXMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
1114+
let Latency = 2;
11151115
let ReleaseAtCycles = [1];
11161116
let NumMicroOps = 1;
11171117
}
1118-
def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
1119-
VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)
1120-
>;
1118+
1119+
// 256-bit VALIGN
1120+
def Zn4WriteYMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
1121+
let Latency = 3;
1122+
let ReleaseAtCycles = [1];
1123+
let NumMicroOps = 1;
1124+
}
1125+
1126+
// 512-bit VALIGN
1127+
def Zn4WriteZMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
1128+
let Latency = 4;
1129+
let ReleaseAtCycles = [2];
1130+
let NumMicroOps = 1;
1131+
}
1132+
1133+
def : InstRW<[Zn4WriteXMMVecALU2Slow], (instrs VALIGNDZ128rri, VALIGNQZ128rri)>; // Z128 suffix = 128-bit (XMM) forms
1134+
def : InstRW<[Zn4WriteYMMVecALU2Slow], (instrs VALIGNDZ256rri, VALIGNQZ256rri)>; // Z256 suffix = 256-bit (YMM) forms
1135+
def : InstRW<[Zn4WriteZMMVecALU2Slow], (instrs VALIGNDZrri, VALIGNQZrri)>; // bare Z suffix = 512-bit (ZMM) forms
1136+
11211137
defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
11221138

11231139
def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
@@ -1326,9 +1342,9 @@ def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;
13261342

13271343
// Strings instructions.
13281344
// Packed Compare Implicit Length Strings, Return Mask
1329-
defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
1345+
defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 7, [8], 3, /*LoadUOps=*/1>;
13301346
// Packed Compare Explicit Length Strings, Return Mask
1331-
defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
1347+
defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 7, [12], 7, /*LoadUOps=*/5>;
13321348
// Packed Compare Implicit Length Strings, Return Index
13331349
defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
13341350
// Packed Compare Explicit Length Strings, Return Index
@@ -1340,7 +1356,7 @@ defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn
13401356
defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.
13411357

13421358
// Carry-less multiplication instructions.
1343-
defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;
1359+
defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [3], 4>;
13441360

13451361
// EMMS/FEMMS
13461362
defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
@@ -1386,44 +1402,44 @@ def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
13861402
def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rmi)>;
13871403

13881404
def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
1389-
let Latency = 7;
1405+
let Latency = 4;
13901406
let ReleaseAtCycles = [1];
1391-
let NumMicroOps = 2;
1407+
let NumMicroOps = 1;
13921408
}
13931409
def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;
13941410

13951411
def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
13961412
let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMPSYrr.Latency);
1397-
let ReleaseAtCycles = [1, 1, 2];
1398-
let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
1413+
let ReleaseAtCycles = [1, 1, 1];
1414+
let NumMicroOps = 1;
13991415
}
14001416
def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;
14011417

14021418
def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
1403-
let Latency = 6;
1419+
let Latency = 4;
14041420
let ReleaseAtCycles = [1];
1405-
let NumMicroOps = 2;
1421+
let NumMicroOps = 1;
14061422
}
14071423
def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;
14081424

14091425
def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
14101426
let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMYri.Latency);
1411-
let ReleaseAtCycles = [1, 1, 2];
1412-
let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
1427+
let ReleaseAtCycles = [1, 1, 1];
1428+
let NumMicroOps = 1;
14131429
}
14141430
def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;
14151431

14161432
def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
1417-
let Latency = 5;
1433+
let Latency = 4;
14181434
let ReleaseAtCycles = [1];
1419-
let NumMicroOps = 2;
1435+
let NumMicroOps = 1;
14201436
}
14211437
def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;
14221438

14231439
def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
14241440
let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMDYrr.Latency);
1425-
let ReleaseAtCycles = [1, 1, 2];
1426-
let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
1441+
let ReleaseAtCycles = [1, 1, 1];
1442+
let NumMicroOps = 1;
14271443
}
14281444
def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;
14291445

llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s

Lines changed: 9 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1403,8 +1403,8 @@ vzeroupper
14031403
# CHECK-NEXT: 1 8 0.50 * vpblendvb %xmm3, (%rax), %xmm1, %xmm2
14041404
# CHECK-NEXT: 1 1 0.25 vpblendw $11, %xmm0, %xmm1, %xmm2
14051405
# CHECK-NEXT: 1 8 0.50 * vpblendw $11, (%rax), %xmm1, %xmm2
1406-
# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %xmm0, %xmm1, %xmm2
1407-
# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %xmm1, %xmm2
1406+
# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %xmm0, %xmm1, %xmm2
1407+
# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %xmm1, %xmm2
14081408
# CHECK-NEXT: 1 1 0.25 vpcmpeqb %xmm0, %xmm1, %xmm2
14091409
# CHECK-NEXT: 1 8 0.50 * vpcmpeqb (%rax), %xmm1, %xmm2
14101410
# CHECK-NEXT: 1 1 0.25 vpcmpeqd %xmm0, %xmm1, %xmm2
@@ -1415,8 +1415,8 @@ vzeroupper
14151415
# CHECK-NEXT: 1 8 0.50 * vpcmpeqw (%rax), %xmm1, %xmm2
14161416
# CHECK-NEXT: 8 6 3.00 vpcmpestri $1, %xmm0, %xmm2
14171417
# CHECK-NEXT: 12 13 3.00 * vpcmpestri $1, (%rax), %xmm2
1418-
# CHECK-NEXT: 7 6 3.00 vpcmpestrm $1, %xmm0, %xmm2
1419-
# CHECK-NEXT: 12 13 3.00 * vpcmpestrm $1, (%rax), %xmm2
1418+
# CHECK-NEXT: 7 7 3.00 vpcmpestrm $1, %xmm0, %xmm2
1419+
# CHECK-NEXT: 12 14 3.00 * vpcmpestrm $1, (%rax), %xmm2
14201420
# CHECK-NEXT: 1 1 0.25 vpcmpgtb %xmm0, %xmm1, %xmm2
14211421
# CHECK-NEXT: 1 8 0.50 * vpcmpgtb (%rax), %xmm1, %xmm2
14221422
# CHECK-NEXT: 1 1 0.25 vpcmpgtd %xmm0, %xmm1, %xmm2
@@ -1427,8 +1427,8 @@ vzeroupper
14271427
# CHECK-NEXT: 1 8 0.50 * vpcmpgtw (%rax), %xmm1, %xmm2
14281428
# CHECK-NEXT: 4 2 2.00 vpcmpistri $1, %xmm0, %xmm2
14291429
# CHECK-NEXT: 4 9 2.00 * vpcmpistri $1, (%rax), %xmm2
1430-
# CHECK-NEXT: 3 6 2.00 vpcmpistrm $1, %xmm0, %xmm2
1431-
# CHECK-NEXT: 4 13 2.00 * vpcmpistrm $1, (%rax), %xmm2
1430+
# CHECK-NEXT: 3 7 2.00 vpcmpistrm $1, %xmm0, %xmm2
1431+
# CHECK-NEXT: 4 14 2.00 * vpcmpistrm $1, (%rax), %xmm2
14321432
# CHECK-NEXT: 1 3 1.00 vperm2f128 $1, %ymm0, %ymm1, %ymm2
14331433
# CHECK-NEXT: 1 10 1.00 * vperm2f128 $1, (%rax), %ymm1, %ymm2
14341434
# CHECK-NEXT: 1 1 0.50 vpermilpd $1, %xmm0, %xmm2
@@ -1749,7 +1749,7 @@ vzeroupper
17491749

17501750
# CHECK: Resource pressure per iteration:
17511751
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
1752-
# CHECK-NEXT: 1.33 1.33 1.33 16.50 16.50 16.50 16.50 - 205.25 393.58 268.08 158.08 208.50 208.50 65.00 119.67 119.67 119.67 107.00 107.00 107.00 19.00 19.00
1752+
# CHECK-NEXT: 1.33 1.33 1.33 16.50 16.50 16.50 16.50 - 204.25 392.58 268.08 158.08 208.50 208.50 65.00 119.67 119.67 119.67 107.00 107.00 107.00 19.00 19.00
17531753

17541754
# CHECK: Resource pressure by instruction:
17551755
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
@@ -2126,8 +2126,8 @@ vzeroupper
21262126
# CHECK-NEXT: - - - - - - - - 0.50 - - 0.50 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpblendvb %xmm3, (%rax), %xmm1, %xmm2
21272127
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpblendw $11, %xmm0, %xmm1, %xmm2
21282128
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpblendw $11, (%rax), %xmm1, %xmm2
2129-
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %xmm0, %xmm1, %xmm2
2130-
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm1, %xmm2
2129+
# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %xmm0, %xmm1, %xmm2
2130+
# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm1, %xmm2
21312131
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpcmpeqb %xmm0, %xmm1, %xmm2
21322132
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpcmpeqb (%rax), %xmm1, %xmm2
21332133
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpcmpeqd %xmm0, %xmm1, %xmm2

0 commit comments

Comments
 (0)