diff --git a/llvm/lib/Target/X86/X86ScheduleZnver4.td b/llvm/lib/Target/X86/X86ScheduleZnver4.td index c5478dd9fc13d..b8ae4e5082543 100644 --- a/llvm/lib/Target/X86/X86ScheduleZnver4.td +++ b/llvm/lib/Target/X86/X86ScheduleZnver4.td @@ -9,23 +9,23 @@ // This file defines the machine model for Znver4 to support instruction // scheduling and other instruction cost heuristics. // Based on: -// * AMD Software Optimization Guide for AMD Family 19h Processors. -// https://www.amd.com/system/files/TechDocs/56665.zip +// * AMD Software Optimization Guide for the AMD Zen4 Microarchitecture. +// https://www.amd.com/system/files/TechDocs/57647.zip //===----------------------------------------------------------------------===// def Znver4Model : SchedMachineModel { - // AMD SOG 19h, 2.9.6 Dispatch + // AMD SOG Zen4, 2.9.6 Dispatch // The processor may dispatch up to 6 macro ops per cycle // into the execution engine. let IssueWidth = 6; - // AMD SOG 19h, 2.10.3 + // AMD SOG Zen4, 2.10.3 // The retire control unit (RCU) tracks the completion status of all // outstanding operations (integer, load/store, and floating-point) and is // the final arbiter for exception processing and recovery. // The unit can receive up to 6 macro ops dispatched per cycle and track up // to 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode. let MicroOpBufferSize = 320; - // AMD SOG 19h, 2.9.1 Op Cache + // AMD SOG Zen4, 2.9.1 Op Cache // The op cache is organized as an associative cache with 64 sets and 8 ways. // At each set-way intersection is an entry containing up to 8 macro ops. // The maximum capacity of the op cache is 6.75K ops. @@ -33,13 +33,13 @@ def Znver4Model : SchedMachineModel { // the op-cache, we limit the loop buffer to 9*12 = 108 to avoid loop // unrolling leading to excessive filling of the op-cache from frontend. 
let LoopMicroOpBufferSize = 108; - // AMD SOG 19h, 2.6.2 L1 Data Cache + // AMD SOG Zen4, 2.6.2 L1 Data Cache // The L1 data cache has a 4- or 5- cycle integer load-to-use latency. - // AMD SOG 19h, 2.12 L1 Data Cache + // AMD SOG Zen4, 2.12 Load-Store Unit // The AGU and LS pipelines are optimized for simple address generation modes. // <...> and can achieve 4-cycle load-to-use integer load latency. let LoadLatency = 4; - // AMD SOG 19h, 2.12 L1 Data Cache + // AMD SOG Zen4, 2.12 Load-Store Unit // The AGU and LS pipelines are optimized for simple address generation modes. // <...> and can achieve <...> 7-cycle load-to-use FP load latency. int VecLoadLatency = 7; @@ -47,7 +47,7 @@ def Znver4Model : SchedMachineModel { int StoreLatency = 1; // FIXME: let HighLatency = 25; // FIXME: any better choice? - // AMD SOG 19h, 2.8 Optimizing Branching + // AMD SOG Zen4, 2.8 Optimizing Branching // The branch misprediction penalty is in the range from 11 to 18 cycles, // <...>. The common case penalty is 13 cycles. let MispredictPenalty = 13; @@ -64,17 +64,17 @@ let SchedModel = Znver4Model in { // RCU //===----------------------------------------------------------------------===// -// AMD SOG 19h, 2.10.3 Retire Control Unit +// AMD SOG Zen4, 2.10.3 Retire Control Unit // The unit can receive up to 6 macro ops dispatched per cycle and track up to -// 320 macro ops in-flight in non-SMT mode or 128 per thread in SMT mode. <...> -// The retire unit handles in-order commit of up to nine macro ops per cycle. -def Zn4RCU : RetireControlUnit; +// 320 macro ops in-flight in non-SMT mode or 160 per thread in SMT mode. <...> +// The retire unit handles in-order commit of up to eight macro ops per cycle. 
+def Zn4RCU : RetireControlUnit; //===----------------------------------------------------------------------===// // Integer Execution Unit // -// AMD SOG 19h, 2.4 Superscalar Organization +// AMD SOG Zen4, 2.4 Superscalar Organization // The processor uses four decoupled independent integer scheduler queues, // each one servicing one ALU pipeline and one or two other pipelines @@ -82,7 +82,7 @@ def Zn4RCU : RetireControlUnit; // Execution pipes //===----------------------------------------------------------------------===// -// AMD SOG 19h, 2.10.2 Execution Units +// AMD SOG Zen4, 2.10.2 Execution Units // The processor contains 4 general purpose integer execution pipes. // Each pipe has an ALU capable of general purpose integer operations. def Zn4ALU0 : ProcResource<1>; @@ -90,11 +90,11 @@ def Zn4ALU1 : ProcResource<1>; def Zn4ALU2 : ProcResource<1>; def Zn4ALU3 : ProcResource<1>; -// AMD SOG 19h, 2.10.2 Execution Units +// AMD SOG Zen4, 2.10.2 Execution Units // There is also a separate branch execution unit. def Zn4BRU1 : ProcResource<1>; -// AMD SOG 19h, 2.10.2 Execution Units +// AMD SOG Zen4, 2.10.2 Execution Units // There are three Address Generation Units (AGUs) for all load and store // address generation. There are also 3 store data movement units // associated with the same schedulers as the AGUs. @@ -106,11 +106,11 @@ def Zn4AGU2 : ProcResource<1>; // Execution Units //===----------------------------------------------------------------------===// -// AMD SOG 19h, 2.10.2 Execution Units +// AMD SOG Zen4, 2.10.2 Execution Units // ALU0 additionally has divide <...> execution capability. defvar Zn4Divider = Zn4ALU0; -// AMD SOG 19h, 2.10.2 Execution Units +// AMD SOG Zen4, 2.10.2 Execution Units // ALU0 additionally has <...> branch execution capability. 
defvar Zn4BRU0 = Zn4ALU0; @@ -143,14 +143,14 @@ def Zn4ALU12 : ProcResGroup<[Zn4ALU1, Zn4ALU2]>; // Scheduling //===----------------------------------------------------------------------===// -// AMD SOG 19h, 2.10.3 Retire Control Unit +// AMD SOG Zen4, 2.10.3 Retire Control Unit // The integer physical register file (PRF) consists of 224 registers. def Zn4IntegerPRF : RegisterFile<224, [GR64, CCR], [1, 1], [1, 0], 6, // Max moves that can be eliminated per cycle. 0>; // Restrict move elimination to zero regs. // anandtech, The integer scheduler has a 4*24 entry macro op capacity. -// AMD SOG 19h, 2.10.1 Schedulers +// AMD SOG Zen4, 2.10.1 Schedulers // The schedulers can receive up to six macro ops per cycle, with a limit of // two per scheduler. Each scheduler can issue one micro op per cycle into // each of its associated pipelines @@ -167,7 +167,7 @@ def Zn4Int : ProcResGroup<[Zn4ALU0, Zn4AGU0, Zn4BRU0, // scheduler 0 // Floating-Point Unit // -// AMD SOG 19h, 2.4 Superscalar Organization +// AMD SOG Zen4, 2.4 Superscalar Organization // The processor uses <...> two decoupled independent floating point schedulers // each servicing two FP pipelines and one store or FP-to-integer pipeline. @@ -175,7 +175,7 @@ def Zn4Int : ProcResGroup<[Zn4ALU0, Zn4AGU0, Zn4BRU0, // scheduler 0 // Execution pipes //===----------------------------------------------------------------------===// -// AMD SOG 19h, 2.10.1 Schedulers +// AMD SOG Zen4, 2.4 Superscalar Organization // <...>, and six FPU pipes. 
// Agner, 22.10 Floating point execution pipes // There are six floating point/vector execution pipes, @@ -188,7 +188,7 @@ def Zn4FP45 : ProcResource<2>; // // Execution Units //===----------------------------------------------------------------------===// -// AMD SOG 19h, 2.11.1 Floating Point Execution Resources +// AMD SOG Zen4, 2.11.1 Floating Point Execution Resources // (v)FMUL*, (v)FMA*, Floating Point Compares, Blendv(DQ) defvar Zn4FPFMul0 = Zn4FP0; @@ -203,7 +203,7 @@ defvar Zn4FPFCvt0 = Zn4FP2; defvar Zn4FPFCvt1 = Zn4FP3; // All Divide and Square Root except Reciprocal Approximation -// AMD SOG 19h, 2.11.1 Floating Point Execution Resources +// AMD SOG Zen4, 2.11.1 Floating Point Execution Resources // FDIV unit can support 2 simultaneous operations in flight // even though it occupies a single pipe. // FIXME: BufferSize=2 ? @@ -252,7 +252,7 @@ defvar Zn4FPCLM1 = Zn4FP1; // Execution pipeline grouping //===----------------------------------------------------------------------===// -// AMD SOG 19h, 2.11 Floating-Point Unit +// AMD SOG Zen4, 2.11 Floating-Point Unit // Stores and floating point to general purpose register transfer // have 2 dedicated pipelines (pipe 5 and 6). def Zn4FPU0123 : ProcResGroup<[Zn4FP0, Zn4FP1, Zn4FP2, Zn4FP3]>; @@ -281,12 +281,12 @@ def Zn4FPFMisc23 : ProcResGroup<[Zn4FPFMisc2, Zn4FPFMisc3]>; def Zn4FPFMisc123 : ProcResGroup<[Zn4FPFMisc1,Zn4FPFMisc2, Zn4FPFMisc3]>; // Loads, Stores and Move to General Register (EX) Operations -// AMD SOG 19h, 2.11 Floating-Point Unit +// AMD SOG Zen4, 2.11 Floating-Point Unit // Stores and floating point to general purpose register transfer // have 2 dedicated pipelines (pipe 5 and 6). defvar Zn4FPLd01 = Zn4FP45; -// AMD SOG 19h, 2.11 Floating-Point Unit +// AMD SOG Zen4, 2.11 Floating-Point Unit // Note that FP stores are supported on two pipelines, // but throughput is limited to one per cycle. 
let Super = Zn4FP45 in @@ -334,9 +334,9 @@ def Zn4FpPRF : RegisterFile<192, [VR64, VR128, VR256, VR512], [1, 1, 1, 1], [0, 6, // Max moves that can be eliminated per cycle. 0>; // Restrict move elimination to zero regs. -// AMD SOG 19h, 2.11 Floating-Point Unit +// AMD SOG Zen4, 2.11 Floating-Point Unit // The floating-point scheduler has a 2*32 entry macro op capacity. -// AMD SOG 19h, 2.11 Floating-Point Unit +// AMD SOG Zen4, 2.11 Floating-Point Unit // <...> the scheduler can issue 1 micro op per cycle for each pipe. // FIXME: those are two separate schedulers, not a single big one. def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/ // scheduler 0 @@ -345,7 +345,7 @@ def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/ // scheduler 0 let BufferSize = !mul(2, 32); } -// AMD SOG 19h, 2.11 Floating-Point Unit +// AMD SOG Zen4, 2.11 Floating-Point Unit // Macro ops can be dispatched to the 64 entry Non Scheduling Queue (NSQ) // even if floating-point scheduler is full. // FIXME: how to model this properly? @@ -355,27 +355,27 @@ def Zn4FP : ProcResGroup<[Zn4FP0, Zn4FP2, /*Zn4FP4,*/ // scheduler 0 // Load-Store Unit // -// AMD SOG 19h, 2.12 Load-Store Unit +// AMD SOG Zen4, 2.12 Load-Store Unit // The LS unit contains three largely independent pipe-lines -// enabling the execution of three 256-bit memory operations per cycle. +// enabling the execution of three memory operations per cycle. def Zn4LSU : ProcResource<3>; -// AMD SOG 19h, 2.12 Load-Store Unit +// AMD SOG Zen4, 2.12 Load-Store Unit // All three memory operations can be loads. let Super = Zn4LSU in def Zn4Load : ProcResource<3> { - // AMD SOG 19h, 2.12 Load-Store Unit - // The LS unit can process up to 72 out-of-order loads. - let BufferSize = 72; + // AMD SOG Zen4, 2.12 Load-Store Unit + // The LS can track up to 48 uncompleted loads and up to 88 completed loads. 
+ let BufferSize = 88; } def Zn4LoadQueue : LoadQueue; -// AMD SOG 19h, 2.12 Load-Store Unit +// AMD SOG Zen4, 2.12 Load-Store Unit // A maximum of two of the memory operations can be stores. let Super = Zn4LSU in def Zn4Store : ProcResource<2> { - // AMD SOG 19h, 2.12 Load-Store Unit + // AMD SOG Zen4, 2.12 Load-Store Unit // The LS unit utilizes a 64-entry store queue (STQ). let BufferSize = 64; } @@ -491,7 +491,7 @@ def : ReadAdvance; def : ReadAdvance; def : ReadAdvance; -// AMD SOG 19h, 2.11 Floating-Point Unit +// AMD SOG Zen4, 2.11 Floating-Point Unit // There is 1 cycle of added latency for a result to cross // from F to I or I to F domain. def : ReadAdvance; diff --git a/llvm/test/tools/llvm-mca/X86/Znver4/zero-idioms.s b/llvm/test/tools/llvm-mca/X86/Znver4/zero-idioms.s index 30df2ed0a926d..fdac31ca75c57 100644 --- a/llvm/test/tools/llvm-mca/X86/Znver4/zero-idioms.s +++ b/llvm/test/tools/llvm-mca/X86/Znver4/zero-idioms.s @@ -521,7 +521,7 @@ vpxorq %zmm19, %zmm19, %zmm21 # CHECK-NEXT: [0,11] .D---R . . . . vpcmpgtb %xmm3, %xmm3, %xmm3 # CHECK-NEXT: [0,12] . D--R . . . . vpcmpgtd %xmm3, %xmm3, %xmm3 # CHECK-NEXT: [0,13] . D--R . . . . vpcmpgtq %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: [0,14] . D--R . . . . vpcmpgtw %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [0,14] . D---R . . . . vpcmpgtw %xmm3, %xmm3, %xmm3 # CHECK-NEXT: [0,15] . D---R . . . . vpcmpgtb %xmm3, %xmm3, %xmm5 # CHECK-NEXT: [0,16] . D---R . . . . vpcmpgtd %xmm3, %xmm3, %xmm5 # CHECK-NEXT: [0,17] . D---R . . . . vpcmpgtq %xmm3, %xmm3, %xmm5 @@ -529,8 +529,8 @@ vpxorq %zmm19, %zmm19, %zmm21 # CHECK-NEXT: [0,19] . D--R . . . . vpcmpgtb %ymm3, %ymm3, %ymm3 # CHECK-NEXT: [0,20] . D--R . . . . vpcmpgtd %ymm3, %ymm3, %ymm3 # CHECK-NEXT: [0,21] . D--R . . . . vpcmpgtq %ymm3, %ymm3, %ymm3 -# CHECK-NEXT: [0,22] . D--R . . . . vpcmpgtw %ymm3, %ymm3, %ymm3 -# CHECK-NEXT: [0,23] . D--R . . . . vpcmpgtb %ymm3, %ymm3, %ymm5 +# CHECK-NEXT: [0,22] . D---R . . . . vpcmpgtw %ymm3, %ymm3, %ymm3 +# CHECK-NEXT: [0,23] . D---R . . 
. . vpcmpgtb %ymm3, %ymm3, %ymm5 # CHECK-NEXT: [0,24] . D--R . . . . vpcmpgtd %ymm3, %ymm3, %ymm5 # CHECK-NEXT: [0,25] . D--R . . . . vpcmpgtq %ymm3, %ymm3, %ymm5 # CHECK-NEXT: [0,26] . D--R . . . . vpcmpgtw %ymm3, %ymm3, %ymm5 @@ -545,7 +545,7 @@ vpxorq %zmm19, %zmm19, %zmm21 # CHECK-NEXT: [0,35] . D----R . . . vpsubb %xmm3, %xmm3, %xmm3 # CHECK-NEXT: [0,36] . .D---R . . . vpsubd %xmm3, %xmm3, %xmm3 # CHECK-NEXT: [0,37] . .D---R . . . vpsubq %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: [0,38] . .D---R . . . vpsubw %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: [0,38] . .D----R . . . vpsubw %xmm3, %xmm3, %xmm3 # CHECK-NEXT: [0,39] . .D----R . . . vpsubb %ymm3, %ymm3, %ymm3 # CHECK-NEXT: [0,40] . .D----R . . . vpsubd %ymm3, %ymm3, %ymm3 # CHECK-NEXT: [0,41] . .D----R . . . vpsubq %ymm3, %ymm3, %ymm3 @@ -553,43 +553,43 @@ vpxorq %zmm19, %zmm19, %zmm21 # CHECK-NEXT: [0,43] . . D---R . . . vpsubb %xmm3, %xmm3, %xmm5 # CHECK-NEXT: [0,44] . . D---R . . . vpsubd %xmm3, %xmm3, %xmm5 # CHECK-NEXT: [0,45] . . D---R . . . vpsubq %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: [0,46] . . D---R . . . vpsubw %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: [0,47] . . D---R . . . vpsubb %ymm3, %ymm3, %ymm5 +# CHECK-NEXT: [0,46] . . D----R . . . vpsubw %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: [0,47] . . D----R . . . vpsubb %ymm3, %ymm3, %ymm5 # CHECK-NEXT: [0,48] . . D---R . . . vpsubd %ymm3, %ymm3, %ymm5 # CHECK-NEXT: [0,49] . . D---R . . . vpsubq %ymm3, %ymm3, %ymm5 # CHECK-NEXT: [0,50] . . D---R . . . vpsubw %ymm3, %ymm3, %ymm5 # CHECK-NEXT: [0,51] . . D---R . . . vpsubb %xmm19, %xmm19, %xmm19 # CHECK-NEXT: [0,52] . . D---R . . . vpsubd %xmm19, %xmm19, %xmm19 # CHECK-NEXT: [0,53] . . D---R . . . vpsubq %xmm19, %xmm19, %xmm19 -# CHECK-NEXT: [0,54] . . D--R . . . vpsubw %xmm19, %xmm19, %xmm19 -# CHECK-NEXT: [0,55] . . D--R . . . vpsubb %ymm19, %ymm19, %ymm19 -# CHECK-NEXT: [0,56] . . D--R . . . vpsubd %ymm19, %ymm19, %ymm19 +# CHECK-NEXT: [0,54] . . D---R . . . vpsubw %xmm19, %xmm19, %xmm19 +# CHECK-NEXT: [0,55] . . D---R . . . 
vpsubb %ymm19, %ymm19, %ymm19 +# CHECK-NEXT: [0,56] . . D---R . . . vpsubd %ymm19, %ymm19, %ymm19 # CHECK-NEXT: [0,57] . . D---R . . . vpsubq %ymm19, %ymm19, %ymm19 # CHECK-NEXT: [0,58] . . D---R . . . vpsubw %ymm19, %ymm19, %ymm19 # CHECK-NEXT: [0,59] . . D---R . . . vpsubb %zmm19, %zmm19, %zmm19 # CHECK-NEXT: [0,60] . . D--R . . . vpsubd %zmm19, %zmm19, %zmm19 # CHECK-NEXT: [0,61] . . D--R . . . vpsubq %zmm19, %zmm19, %zmm19 -# CHECK-NEXT: [0,62] . . D--R . . . vpsubw %zmm19, %zmm19, %zmm19 -# CHECK-NEXT: [0,63] . . D--R . . . vpsubb %xmm19, %xmm19, %xmm21 -# CHECK-NEXT: [0,64] . . D--R . . . vpsubd %xmm19, %xmm19, %xmm21 -# CHECK-NEXT: [0,65] . . D--R . . . vpsubq %xmm19, %xmm19, %xmm21 +# CHECK-NEXT: [0,62] . . D---R. . . vpsubw %zmm19, %zmm19, %zmm19 +# CHECK-NEXT: [0,63] . . D---R. . . vpsubb %xmm19, %xmm19, %xmm21 +# CHECK-NEXT: [0,64] . . D---R. . . vpsubd %xmm19, %xmm19, %xmm21 +# CHECK-NEXT: [0,65] . . D---R. . . vpsubq %xmm19, %xmm19, %xmm21 # CHECK-NEXT: [0,66] . . .D--R. . . vpsubw %xmm19, %xmm19, %xmm21 # CHECK-NEXT: [0,67] . . .D--R. . . vpsubb %ymm19, %ymm19, %ymm21 # CHECK-NEXT: [0,68] . . .D--R. . . vpsubd %ymm19, %ymm19, %ymm21 # CHECK-NEXT: [0,69] . . .D--R. . . vpsubq %ymm19, %ymm19, %ymm21 -# CHECK-NEXT: [0,70] . . .D--R. . . vpsubw %ymm19, %ymm19, %ymm21 -# CHECK-NEXT: [0,71] . . .D--R. . . vpsubb %zmm19, %zmm19, %zmm21 -# CHECK-NEXT: [0,72] . . . D-R. . . vpsubd %zmm19, %zmm19, %zmm21 -# CHECK-NEXT: [0,73] . . . D-R. . . vpsubq %zmm19, %zmm19, %zmm21 -# CHECK-NEXT: [0,74] . . . D-R. . . vpsubw %zmm19, %zmm19, %zmm21 +# CHECK-NEXT: [0,70] . . .D---R . . vpsubw %ymm19, %ymm19, %ymm21 +# CHECK-NEXT: [0,71] . . .D---R . . vpsubb %zmm19, %zmm19, %zmm21 +# CHECK-NEXT: [0,72] . . . D--R . . vpsubd %zmm19, %zmm19, %zmm21 +# CHECK-NEXT: [0,73] . . . D--R . . vpsubq %zmm19, %zmm19, %zmm21 +# CHECK-NEXT: [0,74] . . . D--R . . vpsubw %zmm19, %zmm19, %zmm21 # CHECK-NEXT: [0,75] . . . DeER . . andnps %xmm0, %xmm0 # CHECK-NEXT: [0,76] . . . DeER . . 
andnpd %xmm1, %xmm1 # CHECK-NEXT: [0,77] . . . D--R . . vandnps %xmm2, %xmm2, %xmm2 -# CHECK-NEXT: [0,78] . . . D-R . . vandnpd %xmm1, %xmm1, %xmm1 -# CHECK-NEXT: [0,79] . . . D-R . . vandnps %ymm2, %ymm2, %ymm2 -# CHECK-NEXT: [0,80] . . . D-R . . vandnpd %ymm1, %ymm1, %ymm1 -# CHECK-NEXT: [0,81] . . . D-R . . vandnps %zmm2, %zmm2, %zmm2 -# CHECK-NEXT: [0,82] . . . D-R . . vandnpd %zmm1, %zmm1, %zmm1 +# CHECK-NEXT: [0,78] . . . D--R . . vandnpd %xmm1, %xmm1, %xmm1 +# CHECK-NEXT: [0,79] . . . D--R . . vandnps %ymm2, %ymm2, %ymm2 +# CHECK-NEXT: [0,80] . . . D--R . . vandnpd %ymm1, %ymm1, %ymm1 +# CHECK-NEXT: [0,81] . . . D--R . . vandnps %zmm2, %zmm2, %zmm2 +# CHECK-NEXT: [0,82] . . . D--R . . vandnpd %zmm1, %zmm1, %zmm1 # CHECK-NEXT: [0,83] . . . DeER . . pandn %mm2, %mm2 # CHECK-NEXT: [0,84] . . . DeER . . pandn %xmm2, %xmm2 # CHECK-NEXT: [0,85] . . . D--R . . vpandn %xmm3, %xmm3, %xmm3 @@ -599,7 +599,7 @@ vpxorq %zmm19, %zmm19, %zmm21 # CHECK-NEXT: [0,89] . . . D--R . . vpandnd %ymm19, %ymm19, %ymm19 # CHECK-NEXT: [0,90] . . . D-R . . vpandnq %ymm19, %ymm19, %ymm19 # CHECK-NEXT: [0,91] . . . D-R . . vpandnd %zmm19, %zmm19, %zmm19 -# CHECK-NEXT: [0,92] . . . D-R . . vpandnq %zmm19, %zmm19, %zmm19 +# CHECK-NEXT: [0,92] . . . D--R . . vpandnq %zmm19, %zmm19, %zmm19 # CHECK-NEXT: [0,93] . . . D--R . . vandnps %xmm2, %xmm2, %xmm5 # CHECK-NEXT: [0,94] . . . D--R . . vandnpd %xmm1, %xmm1, %xmm5 # CHECK-NEXT: [0,95] . . . D--R . . vpandn %xmm3, %xmm3, %xmm5 @@ -607,8 +607,8 @@ vpxorq %zmm19, %zmm19, %zmm21 # CHECK-NEXT: [0,97] . . . .D-R . . vandnpd %ymm1, %ymm1, %ymm5 # CHECK-NEXT: [0,98] . . . .D-R . . vpandn %ymm3, %ymm3, %ymm5 # CHECK-NEXT: [0,99] . . . .D-R . . vandnps %zmm2, %zmm2, %zmm5 -# CHECK-NEXT: [0,100] . . . .D-R . . vandnpd %zmm1, %zmm1, %zmm5 -# CHECK-NEXT: [0,101] . . . .D-R . . vpandnd %xmm19, %xmm19, %xmm21 +# CHECK-NEXT: [0,100] . . . .D--R. . vandnpd %zmm1, %zmm1, %zmm5 +# CHECK-NEXT: [0,101] . . . .D--R. . 
vpandnd %xmm19, %xmm19, %xmm21 # CHECK-NEXT: [0,102] . . . . D-R. . vpandnq %xmm19, %xmm19, %xmm21 # CHECK-NEXT: [0,103] . . . . D-R. . vpandnd %ymm19, %ymm19, %ymm21 # CHECK-NEXT: [0,104] . . . . D-R. . vpandnq %ymm19, %ymm19, %ymm21 @@ -630,7 +630,7 @@ vpxorq %zmm19, %zmm19, %zmm21 # CHECK-NEXT: [0,120] . . . . D-R . vpxorq %xmm19, %xmm19, %xmm19 # CHECK-NEXT: [0,121] . . . . D-R . vpxord %ymm19, %ymm19, %ymm19 # CHECK-NEXT: [0,122] . . . . D-R . vpxorq %ymm19, %ymm19, %ymm19 -# CHECK-NEXT: [0,123] . . . . D-R . vpxord %zmm19, %zmm19, %zmm19 +# CHECK-NEXT: [0,123] . . . . D--R. vpxord %zmm19, %zmm19, %zmm19 # CHECK-NEXT: [0,124] . . . . D--R. vpxorq %zmm19, %zmm19, %zmm19 # CHECK-NEXT: [0,125] . . . . D--R. vxorps %xmm4, %xmm4, %xmm5 # CHECK-NEXT: [0,126] . . . . .D-R. vxorpd %xmm1, %xmm1, %xmm3 @@ -638,8 +638,8 @@ vpxorq %zmm19, %zmm19, %zmm21 # CHECK-NEXT: [0,128] . . . . .D-R. vxorpd %ymm1, %ymm1, %ymm3 # CHECK-NEXT: [0,129] . . . . .D-R. vxorps %zmm4, %zmm4, %zmm5 # CHECK-NEXT: [0,130] . . . . .D-R. vxorpd %zmm1, %zmm1, %zmm3 -# CHECK-NEXT: [0,131] . . . . .D-R. vpxor %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: [0,132] . . . . . DR. vpxor %ymm3, %ymm3, %ymm5 +# CHECK-NEXT: [0,131] . . . . .D--R vpxor %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: [0,132] . . . . . D-R vpxor %ymm3, %ymm3, %ymm5 # CHECK-NEXT: [0,133] . . . . . D-R vpxord %xmm19, %xmm19, %xmm21 # CHECK-NEXT: [0,134] . . . . . D-R vpxorq %xmm19, %xmm19, %xmm21 # CHECK-NEXT: [0,135] . . . . . D-R vpxord %ymm19, %ymm19, %ymm21 @@ -668,7 +668,7 @@ vpxorq %zmm19, %zmm19, %zmm21 # CHECK-NEXT: 11. 1 0.0 0.0 3.0 vpcmpgtb %xmm3, %xmm3, %xmm3 # CHECK-NEXT: 12. 1 0.0 0.0 2.0 vpcmpgtd %xmm3, %xmm3, %xmm3 # CHECK-NEXT: 13. 1 0.0 0.0 2.0 vpcmpgtq %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: 14. 1 0.0 0.0 2.0 vpcmpgtw %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: 14. 1 0.0 0.0 3.0 vpcmpgtw %xmm3, %xmm3, %xmm3 # CHECK-NEXT: 15. 1 0.0 0.0 3.0 vpcmpgtb %xmm3, %xmm3, %xmm5 # CHECK-NEXT: 16. 1 0.0 0.0 3.0 vpcmpgtd %xmm3, %xmm3, %xmm5 # CHECK-NEXT: 17. 
1 0.0 0.0 3.0 vpcmpgtq %xmm3, %xmm3, %xmm5 @@ -676,8 +676,8 @@ vpxorq %zmm19, %zmm19, %zmm21 # CHECK-NEXT: 19. 1 0.0 0.0 2.0 vpcmpgtb %ymm3, %ymm3, %ymm3 # CHECK-NEXT: 20. 1 0.0 0.0 2.0 vpcmpgtd %ymm3, %ymm3, %ymm3 # CHECK-NEXT: 21. 1 0.0 0.0 2.0 vpcmpgtq %ymm3, %ymm3, %ymm3 -# CHECK-NEXT: 22. 1 0.0 0.0 2.0 vpcmpgtw %ymm3, %ymm3, %ymm3 -# CHECK-NEXT: 23. 1 0.0 0.0 2.0 vpcmpgtb %ymm3, %ymm3, %ymm5 +# CHECK-NEXT: 22. 1 0.0 0.0 3.0 vpcmpgtw %ymm3, %ymm3, %ymm3 +# CHECK-NEXT: 23. 1 0.0 0.0 3.0 vpcmpgtb %ymm3, %ymm3, %ymm5 # CHECK-NEXT: 24. 1 0.0 0.0 2.0 vpcmpgtd %ymm3, %ymm3, %ymm5 # CHECK-NEXT: 25. 1 0.0 0.0 2.0 vpcmpgtq %ymm3, %ymm3, %ymm5 # CHECK-NEXT: 26. 1 0.0 0.0 2.0 vpcmpgtw %ymm3, %ymm3, %ymm5 @@ -692,7 +692,7 @@ vpxorq %zmm19, %zmm19, %zmm21 # CHECK-NEXT: 35. 1 0.0 0.0 4.0 vpsubb %xmm3, %xmm3, %xmm3 # CHECK-NEXT: 36. 1 0.0 0.0 3.0 vpsubd %xmm3, %xmm3, %xmm3 # CHECK-NEXT: 37. 1 0.0 0.0 3.0 vpsubq %xmm3, %xmm3, %xmm3 -# CHECK-NEXT: 38. 1 0.0 0.0 3.0 vpsubw %xmm3, %xmm3, %xmm3 +# CHECK-NEXT: 38. 1 0.0 0.0 4.0 vpsubw %xmm3, %xmm3, %xmm3 # CHECK-NEXT: 39. 1 0.0 0.0 4.0 vpsubb %ymm3, %ymm3, %ymm3 # CHECK-NEXT: 40. 1 0.0 0.0 4.0 vpsubd %ymm3, %ymm3, %ymm3 # CHECK-NEXT: 41. 1 0.0 0.0 4.0 vpsubq %ymm3, %ymm3, %ymm3 @@ -700,43 +700,43 @@ vpxorq %zmm19, %zmm19, %zmm21 # CHECK-NEXT: 43. 1 0.0 0.0 3.0 vpsubb %xmm3, %xmm3, %xmm5 # CHECK-NEXT: 44. 1 0.0 0.0 3.0 vpsubd %xmm3, %xmm3, %xmm5 # CHECK-NEXT: 45. 1 0.0 0.0 3.0 vpsubq %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: 46. 1 0.0 0.0 3.0 vpsubw %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: 47. 1 0.0 0.0 3.0 vpsubb %ymm3, %ymm3, %ymm5 +# CHECK-NEXT: 46. 1 0.0 0.0 4.0 vpsubw %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: 47. 1 0.0 0.0 4.0 vpsubb %ymm3, %ymm3, %ymm5 # CHECK-NEXT: 48. 1 0.0 0.0 3.0 vpsubd %ymm3, %ymm3, %ymm5 # CHECK-NEXT: 49. 1 0.0 0.0 3.0 vpsubq %ymm3, %ymm3, %ymm5 # CHECK-NEXT: 50. 1 0.0 0.0 3.0 vpsubw %ymm3, %ymm3, %ymm5 # CHECK-NEXT: 51. 1 0.0 0.0 3.0 vpsubb %xmm19, %xmm19, %xmm19 # CHECK-NEXT: 52. 
1 0.0 0.0 3.0 vpsubd %xmm19, %xmm19, %xmm19 # CHECK-NEXT: 53. 1 0.0 0.0 3.0 vpsubq %xmm19, %xmm19, %xmm19 -# CHECK-NEXT: 54. 1 0.0 0.0 2.0 vpsubw %xmm19, %xmm19, %xmm19 -# CHECK-NEXT: 55. 1 0.0 0.0 2.0 vpsubb %ymm19, %ymm19, %ymm19 -# CHECK-NEXT: 56. 1 0.0 0.0 2.0 vpsubd %ymm19, %ymm19, %ymm19 +# CHECK-NEXT: 54. 1 0.0 0.0 3.0 vpsubw %xmm19, %xmm19, %xmm19 +# CHECK-NEXT: 55. 1 0.0 0.0 3.0 vpsubb %ymm19, %ymm19, %ymm19 +# CHECK-NEXT: 56. 1 0.0 0.0 3.0 vpsubd %ymm19, %ymm19, %ymm19 # CHECK-NEXT: 57. 1 0.0 0.0 3.0 vpsubq %ymm19, %ymm19, %ymm19 # CHECK-NEXT: 58. 1 0.0 0.0 3.0 vpsubw %ymm19, %ymm19, %ymm19 # CHECK-NEXT: 59. 1 0.0 0.0 3.0 vpsubb %zmm19, %zmm19, %zmm19 # CHECK-NEXT: 60. 1 0.0 0.0 2.0 vpsubd %zmm19, %zmm19, %zmm19 # CHECK-NEXT: 61. 1 0.0 0.0 2.0 vpsubq %zmm19, %zmm19, %zmm19 -# CHECK-NEXT: 62. 1 0.0 0.0 2.0 vpsubw %zmm19, %zmm19, %zmm19 -# CHECK-NEXT: 63. 1 0.0 0.0 2.0 vpsubb %xmm19, %xmm19, %xmm21 -# CHECK-NEXT: 64. 1 0.0 0.0 2.0 vpsubd %xmm19, %xmm19, %xmm21 -# CHECK-NEXT: 65. 1 0.0 0.0 2.0 vpsubq %xmm19, %xmm19, %xmm21 +# CHECK-NEXT: 62. 1 0.0 0.0 3.0 vpsubw %zmm19, %zmm19, %zmm19 +# CHECK-NEXT: 63. 1 0.0 0.0 3.0 vpsubb %xmm19, %xmm19, %xmm21 +# CHECK-NEXT: 64. 1 0.0 0.0 3.0 vpsubd %xmm19, %xmm19, %xmm21 +# CHECK-NEXT: 65. 1 0.0 0.0 3.0 vpsubq %xmm19, %xmm19, %xmm21 # CHECK-NEXT: 66. 1 0.0 0.0 2.0 vpsubw %xmm19, %xmm19, %xmm21 # CHECK-NEXT: 67. 1 0.0 0.0 2.0 vpsubb %ymm19, %ymm19, %ymm21 # CHECK-NEXT: 68. 1 0.0 0.0 2.0 vpsubd %ymm19, %ymm19, %ymm21 # CHECK-NEXT: 69. 1 0.0 0.0 2.0 vpsubq %ymm19, %ymm19, %ymm21 -# CHECK-NEXT: 70. 1 0.0 0.0 2.0 vpsubw %ymm19, %ymm19, %ymm21 -# CHECK-NEXT: 71. 1 0.0 0.0 2.0 vpsubb %zmm19, %zmm19, %zmm21 -# CHECK-NEXT: 72. 1 0.0 0.0 1.0 vpsubd %zmm19, %zmm19, %zmm21 -# CHECK-NEXT: 73. 1 0.0 0.0 1.0 vpsubq %zmm19, %zmm19, %zmm21 -# CHECK-NEXT: 74. 1 0.0 0.0 1.0 vpsubw %zmm19, %zmm19, %zmm21 +# CHECK-NEXT: 70. 1 0.0 0.0 3.0 vpsubw %ymm19, %ymm19, %ymm21 +# CHECK-NEXT: 71. 
1 0.0 0.0 3.0 vpsubb %zmm19, %zmm19, %zmm21 +# CHECK-NEXT: 72. 1 0.0 0.0 2.0 vpsubd %zmm19, %zmm19, %zmm21 +# CHECK-NEXT: 73. 1 0.0 0.0 2.0 vpsubq %zmm19, %zmm19, %zmm21 +# CHECK-NEXT: 74. 1 0.0 0.0 2.0 vpsubw %zmm19, %zmm19, %zmm21 # CHECK-NEXT: 75. 1 1.0 1.0 0.0 andnps %xmm0, %xmm0 # CHECK-NEXT: 76. 1 1.0 1.0 0.0 andnpd %xmm1, %xmm1 # CHECK-NEXT: 77. 1 0.0 0.0 2.0 vandnps %xmm2, %xmm2, %xmm2 -# CHECK-NEXT: 78. 1 0.0 0.0 1.0 vandnpd %xmm1, %xmm1, %xmm1 -# CHECK-NEXT: 79. 1 0.0 0.0 1.0 vandnps %ymm2, %ymm2, %ymm2 -# CHECK-NEXT: 80. 1 0.0 0.0 1.0 vandnpd %ymm1, %ymm1, %ymm1 -# CHECK-NEXT: 81. 1 0.0 0.0 1.0 vandnps %zmm2, %zmm2, %zmm2 -# CHECK-NEXT: 82. 1 0.0 0.0 1.0 vandnpd %zmm1, %zmm1, %zmm1 +# CHECK-NEXT: 78. 1 0.0 0.0 2.0 vandnpd %xmm1, %xmm1, %xmm1 +# CHECK-NEXT: 79. 1 0.0 0.0 2.0 vandnps %ymm2, %ymm2, %ymm2 +# CHECK-NEXT: 80. 1 0.0 0.0 2.0 vandnpd %ymm1, %ymm1, %ymm1 +# CHECK-NEXT: 81. 1 0.0 0.0 2.0 vandnps %zmm2, %zmm2, %zmm2 +# CHECK-NEXT: 82. 1 0.0 0.0 2.0 vandnpd %zmm1, %zmm1, %zmm1 # CHECK-NEXT: 83. 1 1.0 1.0 0.0 pandn %mm2, %mm2 # CHECK-NEXT: 84. 1 1.0 1.0 0.0 pandn %xmm2, %xmm2 # CHECK-NEXT: 85. 1 0.0 0.0 2.0 vpandn %xmm3, %xmm3, %xmm3 @@ -746,7 +746,7 @@ vpxorq %zmm19, %zmm19, %zmm21 # CHECK-NEXT: 89. 1 0.0 0.0 2.0 vpandnd %ymm19, %ymm19, %ymm19 # CHECK-NEXT: 90. 1 0.0 0.0 1.0 vpandnq %ymm19, %ymm19, %ymm19 # CHECK-NEXT: 91. 1 0.0 0.0 1.0 vpandnd %zmm19, %zmm19, %zmm19 -# CHECK-NEXT: 92. 1 0.0 0.0 1.0 vpandnq %zmm19, %zmm19, %zmm19 +# CHECK-NEXT: 92. 1 0.0 0.0 2.0 vpandnq %zmm19, %zmm19, %zmm19 # CHECK-NEXT: 93. 1 0.0 0.0 2.0 vandnps %xmm2, %xmm2, %xmm5 # CHECK-NEXT: 94. 1 0.0 0.0 2.0 vandnpd %xmm1, %xmm1, %xmm5 # CHECK-NEXT: 95. 1 0.0 0.0 2.0 vpandn %xmm3, %xmm3, %xmm5 @@ -754,8 +754,8 @@ vpxorq %zmm19, %zmm19, %zmm21 # CHECK-NEXT: 97. 1 0.0 0.0 1.0 vandnpd %ymm1, %ymm1, %ymm5 # CHECK-NEXT: 98. 1 0.0 0.0 1.0 vpandn %ymm3, %ymm3, %ymm5 # CHECK-NEXT: 99. 1 0.0 0.0 1.0 vandnps %zmm2, %zmm2, %zmm5 -# CHECK-NEXT: 100. 
1 0.0 0.0 1.0 vandnpd %zmm1, %zmm1, %zmm5 -# CHECK-NEXT: 101. 1 0.0 0.0 1.0 vpandnd %xmm19, %xmm19, %xmm21 +# CHECK-NEXT: 100. 1 0.0 0.0 2.0 vandnpd %zmm1, %zmm1, %zmm5 +# CHECK-NEXT: 101. 1 0.0 0.0 2.0 vpandnd %xmm19, %xmm19, %xmm21 # CHECK-NEXT: 102. 1 0.0 0.0 1.0 vpandnq %xmm19, %xmm19, %xmm21 # CHECK-NEXT: 103. 1 0.0 0.0 1.0 vpandnd %ymm19, %ymm19, %ymm21 # CHECK-NEXT: 104. 1 0.0 0.0 1.0 vpandnq %ymm19, %ymm19, %ymm21 @@ -777,7 +777,7 @@ vpxorq %zmm19, %zmm19, %zmm21 # CHECK-NEXT: 120. 1 0.0 0.0 1.0 vpxorq %xmm19, %xmm19, %xmm19 # CHECK-NEXT: 121. 1 0.0 0.0 1.0 vpxord %ymm19, %ymm19, %ymm19 # CHECK-NEXT: 122. 1 0.0 0.0 1.0 vpxorq %ymm19, %ymm19, %ymm19 -# CHECK-NEXT: 123. 1 0.0 0.0 1.0 vpxord %zmm19, %zmm19, %zmm19 +# CHECK-NEXT: 123. 1 0.0 0.0 2.0 vpxord %zmm19, %zmm19, %zmm19 # CHECK-NEXT: 124. 1 0.0 0.0 2.0 vpxorq %zmm19, %zmm19, %zmm19 # CHECK-NEXT: 125. 1 0.0 0.0 2.0 vxorps %xmm4, %xmm4, %xmm5 # CHECK-NEXT: 126. 1 0.0 0.0 1.0 vxorpd %xmm1, %xmm1, %xmm3 @@ -785,12 +785,12 @@ vpxorq %zmm19, %zmm19, %zmm21 # CHECK-NEXT: 128. 1 0.0 0.0 1.0 vxorpd %ymm1, %ymm1, %ymm3 # CHECK-NEXT: 129. 1 0.0 0.0 1.0 vxorps %zmm4, %zmm4, %zmm5 # CHECK-NEXT: 130. 1 0.0 0.0 1.0 vxorpd %zmm1, %zmm1, %zmm3 -# CHECK-NEXT: 131. 1 0.0 0.0 1.0 vpxor %xmm3, %xmm3, %xmm5 -# CHECK-NEXT: 132. 1 0.0 0.0 0.0 vpxor %ymm3, %ymm3, %ymm5 +# CHECK-NEXT: 131. 1 0.0 0.0 2.0 vpxor %xmm3, %xmm3, %xmm5 +# CHECK-NEXT: 132. 1 0.0 0.0 1.0 vpxor %ymm3, %ymm3, %ymm5 # CHECK-NEXT: 133. 1 0.0 0.0 1.0 vpxord %xmm19, %xmm19, %xmm21 # CHECK-NEXT: 134. 1 0.0 0.0 1.0 vpxorq %xmm19, %xmm19, %xmm21 # CHECK-NEXT: 135. 1 0.0 0.0 1.0 vpxord %ymm19, %ymm19, %ymm21 # CHECK-NEXT: 136. 1 0.0 0.0 1.0 vpxorq %ymm19, %ymm19, %ymm21 # CHECK-NEXT: 137. 1 0.0 0.0 1.0 vpxord %zmm19, %zmm19, %zmm21 # CHECK-NEXT: 138. 
1 0.0 0.0 0.0 vpxorq %zmm19, %zmm19, %zmm21 -# CHECK-NEXT: 1 0.2 0.1 1.6 +# CHECK-NEXT: 1 0.2 0.1 1.8 diff --git a/llvm/test/tools/llvm-mca/X86/scheduler-queue-usage.s b/llvm/test/tools/llvm-mca/X86/scheduler-queue-usage.s index 2d5c0483de7df..360d9634216c8 100644 --- a/llvm/test/tools/llvm-mca/X86/scheduler-queue-usage.s +++ b/llvm/test/tools/llvm-mca/X86/scheduler-queue-usage.s @@ -176,5 +176,5 @@ xor %eax, %ebx # ZNVER4: [1] [2] [3] [4] # ZNVER4-NEXT: Zn4FP 0 0 64 # ZNVER4-NEXT: Zn4Int 0 1 96 -# ZNVER4-NEXT: Zn4Load 0 0 72 +# ZNVER4-NEXT: Zn4Load 0 0 88 # ZNVER4-NEXT: Zn4Store 0 0 64