Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
110 changes: 63 additions & 47 deletions llvm/lib/Target/X86/X86ScheduleZnver4.td
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
//===----------------------------------------------------------------------===//

def Znver4Model : SchedMachineModel {
// AMD SOG Zen4, 2.9.6 Dispatch
// AMD SOG Zen4, 2.9.8 Dispatch
// The processor may dispatch up to 6 macro ops per cycle
// into the execution engine.
let IssueWidth = 6;
Expand Down Expand Up @@ -46,8 +46,9 @@ def Znver4Model : SchedMachineModel {
int VecLoadLatency = 7;
// Latency of a simple store operation.
int StoreLatency = 1;
// FIXME:
let HighLatency = 25; // FIXME: any better choice?
// Mean and median value for all instructions with latencies >6
// Source: Zen4 Instruction Latencies spreadsheet (included with SOG)
let HighLatency = 13;
// AMD SOG Zen4, 2.8 Optimizing Branching
// The branch misprediction penalty is in the range from 11 to 18 cycles,
// <...>. The common case penalty is 13 cycles.
Expand Down Expand Up @@ -612,6 +613,7 @@ def Zn4WriteLEA : SchedWriteVariant<[

def : InstRW<[Zn4WriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;

// values from uops.info
def Zn4SlowLEA16r : SchedWriteRes<[Zn4ALU0123]> {
let Latency = 2; // FIXME: not from llvm-exegesis
let ReleaseAtCycles = [4];
Expand Down Expand Up @@ -659,15 +661,15 @@ def : InstRW<[Zn4WriteCMPXCHG8rm_LCMPXCHG8], (instrs CMPXCHG8rm, LCMPXCHG8)>;

def Zn4WriteCMPXCHG8B : SchedWriteRes<[Zn4ALU0123]> {
let Latency = 3; // FIXME: not from llvm-exegesis
let ReleaseAtCycles = [24];
let NumMicroOps = 19;
let ReleaseAtCycles = [20];
let NumMicroOps = 15;
}
def : InstRW<[Zn4WriteCMPXCHG8B], (instrs CMPXCHG8B)>;

def Zn4WriteCMPXCHG16B_LCMPXCHG16B : SchedWriteRes<[Zn4ALU0123]> {
let Latency = 4; // FIXME: not from llvm-exegesis
let ReleaseAtCycles = [59];
let NumMicroOps = 28;
let Latency = 2; // FIXME: not from llvm-exegesis
let ReleaseAtCycles = [40];
let NumMicroOps = 26;
}
def : InstRW<[Zn4WriteCMPXCHG16B_LCMPXCHG16B], (instrs CMPXCHG16B, LCMPXCHG16B)>;

Expand All @@ -681,7 +683,7 @@ def : InstRW<[Zn4WriteWriteXCHGUnrenameable], (instrs XCHG8rr, XCHG16rr, XCHG16a
def Zn4WriteXCHG8rm_XCHG16rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]> {
let Latency = !add(Znver4Model.LoadLatency, 3); // FIXME: not from llvm-exegesis
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = 5;
let NumMicroOps = 2;
}
def : InstRW<[Zn4WriteXCHG8rm_XCHG16rm], (instrs XCHG8rm, XCHG16rm)>;

Expand All @@ -693,19 +695,17 @@ def Zn4WriteXCHG32rm_XCHG64rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4ALU0123]>
def : InstRW<[Zn4WriteXCHG32rm_XCHG64rm], (instrs XCHG32rm, XCHG64rm)>;

// Integer division.
// FIXME: uops for 8-bit division measures as 2. for others it's a guess.
// FIXME: latency for 8-bit division measures as 10. for others it's a guess.
defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 10, [10], 2>;
defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 11, [11], 2>;
defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 13, [13], 2>;
defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 17, [17], 2>;
defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 10, [10], 2>;
defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 11, [11], 2>;
defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 13, [13], 2>;
defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 17, [17], 2>;

defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan forward.
defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 6, /*LoadUOps=*/1>; // Bit scan reverse.
defm : Zn4WriteResIntPair<WriteDiv8, [Zn4Divider], 9, [9], 2>;
defm : Zn4WriteResIntPair<WriteDiv16, [Zn4Divider], 10, [10], 2>;
defm : Zn4WriteResIntPair<WriteDiv32, [Zn4Divider], 12, [12], 2>;
defm : Zn4WriteResIntPair<WriteDiv64, [Zn4Divider], 18, [18], 2>;
defm : Zn4WriteResIntPair<WriteIDiv8, [Zn4Divider], 9, [9], 2>;
defm : Zn4WriteResIntPair<WriteIDiv16, [Zn4Divider], 10, [10], 2>;
defm : Zn4WriteResIntPair<WriteIDiv32, [Zn4Divider], 12, [12], 2>;
defm : Zn4WriteResIntPair<WriteIDiv64, [Zn4Divider], 18, [18], 2>;

defm : Zn4WriteResIntPair<WriteBSF, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan forward.
defm : Zn4WriteResIntPair<WriteBSR, [Zn4ALU1], 1, [1], 1, /*LoadUOps=*/1>; // Bit scan reverse.

defm : Zn4WriteResIntPair<WritePOPCNT, [Zn4ALU0123], 1, [1], 1>; // Bit population count.

Expand All @@ -725,12 +725,12 @@ def Zn4WriteLZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
}
def : InstRW<[Zn4WriteLZCNT16rr], (instrs LZCNT16rr)>;

defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 2, [1], 2>; // Trailing zero count.
defm : Zn4WriteResIntPair<WriteTZCNT, [Zn4ALU12], 1, [1], 1>; // Trailing zero count.

def Zn4WriteTZCNT16rr : SchedWriteRes<[Zn4ALU0123]> {
let Latency = 2;
let ReleaseAtCycles = [4];
let NumMicroOps = 2;
let Latency = 1;
let ReleaseAtCycles = [1];
let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteTZCNT16rr], (instrs TZCNT16rr)>;

Expand Down Expand Up @@ -1109,15 +1109,31 @@ def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
}
def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;

def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
// TODO: All align instructions are expected to be of 4 cycle latency
let Latency = 4;
// 128-bit VALIGN
def Zn4WriteXMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
let Latency = 2;
let ReleaseAtCycles = [1];
let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)
>;

// 256-bit VALIGN
def Zn4WriteYMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
let Latency = 3;
let ReleaseAtCycles = [1];
let NumMicroOps = 1;
}

// 512-bit VALIGN
def Zn4WriteZMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
let Latency = 4;
let ReleaseAtCycles = [2];
let NumMicroOps = 1;
}

def : InstRW<[Zn4WriteXMMVecALU2Slow], (instrs VALIGNDZrri, VALIGNQZrri)>;
def : InstRW<[Zn4WriteYMMVecALU2Slow], (instrs VALIGNDZ128rri, VALIGNQZ128rri)>;
def : InstRW<[Zn4WriteZMMVecALU2Slow], (instrs VALIGNDZ256rri, VALIGNQZ256rri)>;

defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).

def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
Expand Down Expand Up @@ -1326,9 +1342,9 @@ def : InstRW<[Zn4WriteSHA256RNDS2rr], (instrs SHA256RNDS2rr)>;

// Strings instructions.
// Packed Compare Implicit Length Strings, Return Mask
defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 6, [8], 3, /*LoadUOps=*/1>;
defm : Zn4WriteResXMMPair<WritePCmpIStrM, [Zn4FPVAdd0123], 7, [8], 3, /*LoadUOps=*/1>;
// Packed Compare Explicit Length Strings, Return Mask
defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 6, [12], 7, /*LoadUOps=*/5>;
defm : Zn4WriteResXMMPair<WritePCmpEStrM, [Zn4FPVAdd0123], 7, [12], 7, /*LoadUOps=*/5>;
// Packed Compare Implicit Length Strings, Return Index
defm : Zn4WriteResXMMPair<WritePCmpIStrI, [Zn4FPVAdd0123], 2, [8], 4>;
// Packed Compare Explicit Length Strings, Return Index
Expand All @@ -1340,7 +1356,7 @@ defm : Zn4WriteResXMMPair<WriteAESIMC, [Zn4FPAES01], 4, [1], 1>; // InvMixColumn
defm : Zn4WriteResXMMPair<WriteAESKeyGen, [Zn4FPAES01], 4, [1], 1>; // Key Generation.

// Carry-less multiplication instructions.
defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [4], 4>;
defm : Zn4WriteResXMMPair<WriteCLMul, [Zn4FPCLM01], 4, [3], 4>;

// EMMS/FEMMS
defm : Zn4WriteResInt<WriteEMMS, [Zn4ALU0123], 2, [1], 1>; // FIXME: latency not from llvm-exegesis
Expand Down Expand Up @@ -1386,44 +1402,44 @@ def Zn4WriteVPERM2F128rm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
def : InstRW<[Zn4WriteVPERM2F128rm], (instrs VPERM2F128rmi)>;

def Zn4WriteVPERMPSYrr : SchedWriteRes<[Zn4FPVShuf]> {
let Latency = 7;
let Latency = 4;
let ReleaseAtCycles = [1];
let NumMicroOps = 2;
let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMPSYrr], (instrs VPERMPSYrr)>;

def Zn4WriteVPERMPSYrm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMPSYrr.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn4WriteVPERMPSYrr.NumMicroOps, 1);
let ReleaseAtCycles = [1, 1, 1];
let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMPSYrm], (instrs VPERMPSYrm)>;

def Zn4WriteVPERMYri : SchedWriteRes<[Zn4FPVShuf]> {
let Latency = 6;
let Latency = 4;
let ReleaseAtCycles = [1];
let NumMicroOps = 2;
let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMYri], (instrs VPERMPDYri, VPERMQYri)>;

def Zn4WriteVPERMPDYmi : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMYri.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn4WriteVPERMYri.NumMicroOps, 1);
let ReleaseAtCycles = [1, 1, 1];
let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMPDYmi], (instrs VPERMPDYmi)>;

def Zn4WriteVPERMDYrr : SchedWriteRes<[Zn4FPVShuf]> {
let Latency = 5;
let Latency = 4;
let ReleaseAtCycles = [1];
let NumMicroOps = 2;
let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMDYrr], (instrs VPERMDYrr)>;

def Zn4WriteVPERMYm : SchedWriteRes<[Zn4AGU012, Zn4Load, Zn4FPVShuf]> {
let Latency = !add(Znver4Model.VecLoadLatency, Zn4WriteVPERMDYrr.Latency);
let ReleaseAtCycles = [1, 1, 2];
let NumMicroOps = !add(Zn4WriteVPERMDYrr.NumMicroOps, 0);
let ReleaseAtCycles = [1, 1, 1];
let NumMicroOps = 1;
}
def : InstRW<[Zn4WriteVPERMYm], (instrs VPERMQYmi, VPERMDYrm)>;

Expand Down
18 changes: 9 additions & 9 deletions llvm/test/tools/llvm-mca/X86/Znver4/resources-avx1.s
Original file line number Diff line number Diff line change
Expand Up @@ -1403,8 +1403,8 @@ vzeroupper
# CHECK-NEXT: 1 8 0.50 * vpblendvb %xmm3, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 1 0.25 vpblendw $11, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 8 0.50 * vpblendw $11, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 4 4 2.00 vpclmulqdq $11, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 4 11 2.00 * vpclmulqdq $11, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 4 4 1.50 vpclmulqdq $11, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 4 11 1.50 * vpclmulqdq $11, (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 1 0.25 vpcmpeqb %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 8 0.50 * vpcmpeqb (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 1 0.25 vpcmpeqd %xmm0, %xmm1, %xmm2
Expand All @@ -1415,8 +1415,8 @@ vzeroupper
# CHECK-NEXT: 1 8 0.50 * vpcmpeqw (%rax), %xmm1, %xmm2
# CHECK-NEXT: 8 6 3.00 vpcmpestri $1, %xmm0, %xmm2
# CHECK-NEXT: 12 13 3.00 * vpcmpestri $1, (%rax), %xmm2
# CHECK-NEXT: 7 6 3.00 vpcmpestrm $1, %xmm0, %xmm2
# CHECK-NEXT: 12 13 3.00 * vpcmpestrm $1, (%rax), %xmm2
# CHECK-NEXT: 7 7 3.00 vpcmpestrm $1, %xmm0, %xmm2
# CHECK-NEXT: 12 14 3.00 * vpcmpestrm $1, (%rax), %xmm2
# CHECK-NEXT: 1 1 0.25 vpcmpgtb %xmm0, %xmm1, %xmm2
# CHECK-NEXT: 1 8 0.50 * vpcmpgtb (%rax), %xmm1, %xmm2
# CHECK-NEXT: 1 1 0.25 vpcmpgtd %xmm0, %xmm1, %xmm2
Expand All @@ -1427,8 +1427,8 @@ vzeroupper
# CHECK-NEXT: 1 8 0.50 * vpcmpgtw (%rax), %xmm1, %xmm2
# CHECK-NEXT: 4 2 2.00 vpcmpistri $1, %xmm0, %xmm2
# CHECK-NEXT: 4 9 2.00 * vpcmpistri $1, (%rax), %xmm2
# CHECK-NEXT: 3 6 2.00 vpcmpistrm $1, %xmm0, %xmm2
# CHECK-NEXT: 4 13 2.00 * vpcmpistrm $1, (%rax), %xmm2
# CHECK-NEXT: 3 7 2.00 vpcmpistrm $1, %xmm0, %xmm2
# CHECK-NEXT: 4 14 2.00 * vpcmpistrm $1, (%rax), %xmm2
# CHECK-NEXT: 1 3 1.00 vperm2f128 $1, %ymm0, %ymm1, %ymm2
# CHECK-NEXT: 1 10 1.00 * vperm2f128 $1, (%rax), %ymm1, %ymm2
# CHECK-NEXT: 1 1 0.50 vpermilpd $1, %xmm0, %xmm2
Expand Down Expand Up @@ -1749,7 +1749,7 @@ vzeroupper

# CHECK: Resource pressure per iteration:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1]
# CHECK-NEXT: 1.33 1.33 1.33 16.50 16.50 16.50 16.50 - 205.25 393.58 268.08 158.08 208.50 208.50 65.00 119.67 119.67 119.67 107.00 107.00 107.00 19.00 19.00
# CHECK-NEXT: 1.33 1.33 1.33 16.50 16.50 16.50 16.50 - 204.25 392.58 268.08 158.08 208.50 208.50 65.00 119.67 119.67 119.67 107.00 107.00 107.00 19.00 19.00

# CHECK: Resource pressure by instruction:
# CHECK-NEXT: [0] [1] [2] [3] [4] [5] [6] [7] [8] [9] [10] [11] [12.0] [12.1] [13] [14.0] [14.1] [14.2] [15.0] [15.1] [15.2] [16.0] [16.1] Instructions:
Expand Down Expand Up @@ -2126,8 +2126,8 @@ vzeroupper
# CHECK-NEXT: - - - - - - - - 0.50 - - 0.50 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpblendvb %xmm3, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpblendw $11, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpblendw $11, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - - - - - - - - - - - - vpclmulqdq $11, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 2.00 2.00 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - - - - - - - - - - - - vpclmulqdq $11, %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 1.50 1.50 - - 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpclmulqdq $11, (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpcmpeqb %xmm0, %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 0.50 0.50 - 0.33 0.33 0.33 0.33 0.33 0.33 - - vpcmpeqb (%rax), %xmm1, %xmm2
# CHECK-NEXT: - - - - - - - - 0.25 0.25 0.25 0.25 - - - - - - - - - - - vpcmpeqd %xmm0, %xmm1, %xmm2
Expand Down
Loading