Skip to content

Commit a044d61

Browse files
authored
[AArch64] Update IssueWidth for Neoverse V1, N1, N3 (#154495)
Recently the IssueWidth in the scheduling model was reduced for Neoverse-V2 and N2. This patch does the same for Neoverse-V1, N1 and N3.

On Neoverse-V1, IssueWidth values of 15, 8, 7, 6 and 5 were tried with runs of various workloads. The highest overall geomean score was achieved with an issue width of 8, and no significant regressions were noted.

On Neoverse-N1, IssueWidth values of 8, 6, 5, 4 and 3 were tried with runs of various workloads. The highest overall geomean score was achieved with an issue width of 3, and no significant regressions were noted.

On Neoverse-N3, it makes sense to do exactly the same as was done for N2, so an issue width of 5 is proposed.

Related V2 PR: #142565
Related N2 PR: #145717
1 parent 46ad540 commit a044d61

12 files changed

+5759
-5733
lines changed

llvm/lib/Target/AArch64/AArch64SchedNeoverseN1.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,9 @@
1515
//===----------------------------------------------------------------------===//
1616

1717
def NeoverseN1Model : SchedMachineModel {
18-
let IssueWidth = 8; // Maximum micro-ops dispatch rate.
18+
let IssueWidth = 3; // This value comes from the decode bandwidth
19+
// and empirical measurements showed that this
20+
// value is better.
1921
let MicroOpBufferSize = 128; // NOTE: Copied from Cortex-A76.
2022
let LoadLatency = 4; // Optimistic load latency.
2123
let MispredictPenalty = 11; // Cycles cost of branch mispredicted.

llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
//===----------------------------------------------------------------------===//
1212

1313
def NeoverseN3Model : SchedMachineModel {
14-
let IssueWidth = 10; // Micro-ops dispatched at a time.
14+
let IssueWidth = 5; // Micro-ops dispatched at a time.
1515
let MicroOpBufferSize = 160; // Entries in micro-op re-order buffer. NOTE: Copied from N2.
1616
let LoadLatency = 4; // Optimistic load latency.
1717
let MispredictPenalty = 10; // Extra cycles for mispredicted branch. NOTE: Copied from N2.

llvm/lib/Target/AArch64/AArch64SchedNeoverseV1.td

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919
//===----------------------------------------------------------------------===//
2020

2121
def NeoverseV1Model : SchedMachineModel {
22-
let IssueWidth = 15; // Maximum micro-ops dispatch rate.
22+
let IssueWidth = 8; // This value comes from the decode bandwidth
23+
// and empirical measurements showed that this
24+
// value is better.
2325
let MicroOpBufferSize = 256; // Micro-op re-order buffer.
2426
let LoadLatency = 4; // Optimistic load latency.
2527
let MispredictPenalty = 11; // Cycles cost of branch mispredicted.

llvm/test/tools/llvm-mca/AArch64/Neoverse/N1-writeback.s

Lines changed: 2214 additions & 2203 deletions
Large diffs are not rendered by default.

llvm/test/tools/llvm-mca/AArch64/Neoverse/N3-writeback.s

Lines changed: 1916 additions & 1906 deletions
Large diffs are not rendered by default.

llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-basic-instructions.s

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2688,7 +2688,7 @@ drps
26882688
# CHECK-NEXT: 1 1 0.25 movk x7, #0, lsl #32
26892689
# CHECK-NEXT: 1 1 0.25 movz x8, #0, lsl #48
26902690
# CHECK-NEXT: 1 1 0.25 movk x9, #0, lsl #48
2691-
# CHECK-NEXT: 1 1 0.07 U msr DAIFSet, #0
2691+
# CHECK-NEXT: 1 1 0.12 U msr DAIFSet, #0
26922692
# CHECK-NEXT: 1 1 0.25 adr x2, #1600
26932693
# CHECK-NEXT: 1 1 0.25 adrp x21, #6553600
26942694
# CHECK-NEXT: 1 1 0.25 adr x0, #262144

llvm/test/tools/llvm-mca/AArch64/Neoverse/V1-clear-upper-regs.s

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -58,7 +58,7 @@ add v0.16b, v0.16b, v0.16b
5858
# CHECK-NEXT: Total Cycles: 41
5959
# CHECK-NEXT: Total uOps: 200
6060

61-
# CHECK: Dispatch Width: 15
61+
# CHECK: Dispatch Width: 8
6262
# CHECK-NEXT: uOps Per Cycle: 4.88
6363
# CHECK-NEXT: IPC: 4.88
6464
# CHECK-NEXT: Block RThroughput: 0.3
@@ -134,7 +134,7 @@ add v0.16b, v0.16b, v0.16b
134134
# CHECK-NEXT: Total Cycles: 44
135135
# CHECK-NEXT: Total uOps: 200
136136

137-
# CHECK: Dispatch Width: 15
137+
# CHECK: Dispatch Width: 8
138138
# CHECK-NEXT: uOps Per Cycle: 4.55
139139
# CHECK-NEXT: IPC: 4.55
140140
# CHECK-NEXT: Block RThroughput: 0.3
@@ -211,7 +211,7 @@ add v0.16b, v0.16b, v0.16b
211211
# CHECK-NEXT: Total Cycles: 44
212212
# CHECK-NEXT: Total uOps: 200
213213

214-
# CHECK: Dispatch Width: 15
214+
# CHECK: Dispatch Width: 8
215215
# CHECK-NEXT: uOps Per Cycle: 4.55
216216
# CHECK-NEXT: IPC: 4.55
217217
# CHECK-NEXT: Block RThroughput: 0.3
@@ -288,7 +288,7 @@ add v0.16b, v0.16b, v0.16b
288288
# CHECK-NEXT: Total Cycles: 44
289289
# CHECK-NEXT: Total uOps: 200
290290

291-
# CHECK: Dispatch Width: 15
291+
# CHECK: Dispatch Width: 8
292292
# CHECK-NEXT: uOps Per Cycle: 4.55
293293
# CHECK-NEXT: IPC: 4.55
294294
# CHECK-NEXT: Block RThroughput: 0.3
@@ -365,7 +365,7 @@ add v0.16b, v0.16b, v0.16b
365365
# CHECK-NEXT: Total Cycles: 44
366366
# CHECK-NEXT: Total uOps: 200
367367

368-
# CHECK: Dispatch Width: 15
368+
# CHECK: Dispatch Width: 8
369369
# CHECK-NEXT: uOps Per Cycle: 4.55
370370
# CHECK-NEXT: IPC: 4.55
371371
# CHECK-NEXT: Block RThroughput: 0.3
@@ -442,7 +442,7 @@ add v0.16b, v0.16b, v0.16b
442442
# CHECK-NEXT: Total Cycles: 44
443443
# CHECK-NEXT: Total uOps: 200
444444

445-
# CHECK: Dispatch Width: 15
445+
# CHECK: Dispatch Width: 8
446446
# CHECK-NEXT: uOps Per Cycle: 4.55
447447
# CHECK-NEXT: IPC: 4.55
448448
# CHECK-NEXT: Block RThroughput: 0.3
@@ -519,7 +519,7 @@ add v0.16b, v0.16b, v0.16b
519519
# CHECK-NEXT: Total Cycles: 44
520520
# CHECK-NEXT: Total uOps: 200
521521

522-
# CHECK: Dispatch Width: 15
522+
# CHECK: Dispatch Width: 8
523523
# CHECK-NEXT: uOps Per Cycle: 4.55
524524
# CHECK-NEXT: IPC: 4.55
525525
# CHECK-NEXT: Block RThroughput: 0.3
@@ -596,7 +596,7 @@ add v0.16b, v0.16b, v0.16b
596596
# CHECK-NEXT: Total Cycles: 44
597597
# CHECK-NEXT: Total uOps: 200
598598

599-
# CHECK: Dispatch Width: 15
599+
# CHECK: Dispatch Width: 8
600600
# CHECK-NEXT: uOps Per Cycle: 4.55
601601
# CHECK-NEXT: IPC: 4.55
602602
# CHECK-NEXT: Block RThroughput: 0.3
@@ -673,7 +673,7 @@ add v0.16b, v0.16b, v0.16b
673673
# CHECK-NEXT: Total Cycles: 403
674674
# CHECK-NEXT: Total uOps: 200
675675

676-
# CHECK: Dispatch Width: 15
676+
# CHECK: Dispatch Width: 8
677677
# CHECK-NEXT: uOps Per Cycle: 0.50
678678
# CHECK-NEXT: IPC: 0.50
679679
# CHECK-NEXT: Block RThroughput: 0.5
@@ -750,7 +750,7 @@ add v0.16b, v0.16b, v0.16b
750750
# CHECK-NEXT: Total Cycles: 1003
751751
# CHECK-NEXT: Total uOps: 300
752752

753-
# CHECK: Dispatch Width: 15
753+
# CHECK: Dispatch Width: 8
754754
# CHECK-NEXT: uOps Per Cycle: 0.30
755755
# CHECK-NEXT: IPC: 0.20
756756
# CHECK-NEXT: Block RThroughput: 0.5
@@ -805,9 +805,9 @@ add v0.16b, v0.16b, v0.16b
805805
# CHECK-NEXT: [1,0] D==========eeeeeeeeER . . . . . ld1 { v0.b }[0], [sp]
806806
# CHECK-NEXT: [1,1] D==================eeER . . . . . add v0.16b, v0.16b, v0.16b
807807
# CHECK-NEXT: [2,0] D====================eeeeeeeeER . . . ld1 { v0.b }[0], [sp]
808-
# CHECK-NEXT: [2,1] D============================eeER . . . add v0.16b, v0.16b, v0.16b
809-
# CHECK-NEXT: [3,0] D==============================eeeeeeeeER . ld1 { v0.b }[0], [sp]
810-
# CHECK-NEXT: [3,1] D======================================eeER add v0.16b, v0.16b, v0.16b
808+
# CHECK-NEXT: [2,1] .D===========================eeER . . . add v0.16b, v0.16b, v0.16b
809+
# CHECK-NEXT: [3,0] .D=============================eeeeeeeeER . ld1 { v0.b }[0], [sp]
810+
# CHECK-NEXT: [3,1] .D=====================================eeER add v0.16b, v0.16b, v0.16b
811811

812812
# CHECK: Average Wait times (based on the timeline view):
813813
# CHECK-NEXT: [0]: Executions
@@ -816,6 +816,6 @@ add v0.16b, v0.16b, v0.16b
816816
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
817817

818818
# CHECK: [0] [1] [2] [3]
819-
# CHECK-NEXT: 0. 4 16.0 0.3 0.0 ld1 { v0.b }[0], [sp]
820-
# CHECK-NEXT: 1. 4 24.0 0.0 0.0 add v0.16b, v0.16b, v0.16b
821-
# CHECK-NEXT: 4 20.0 0.1 0.0 <total>
819+
# CHECK-NEXT: 0. 4 15.8 0.3 0.0 ld1 { v0.b }[0], [sp]
820+
# CHECK-NEXT: 1. 4 23.5 0.0 0.0 add v0.16b, v0.16b, v0.16b
821+
# CHECK-NEXT: 4 19.6 0.1 0.0 <total>

0 commit comments

Comments
 (0)