Skip to content

Commit 2412421

Browse files
committed
[AArch64] Initial sched model for Neoverse V3, V3AE
Add the scheduling models for Neoverse V3 and Neoverse V3AE, based on the V3 Software Optimization Guide (https://developer.arm.com/documentation/109678/300/?lang=en) and the V3AE Software Optimization Guide (https://developer.arm.com/documentation/109703/300/?lang=en). Implements #134977. Change-Id: I2355c8a92c2350d55b670d6a2acb0e22e1cacc54
1 parent 73a42dd commit 2412421

File tree

6 files changed

+495
-504
lines changed

6 files changed

+495
-504
lines changed

llvm/lib/Target/AArch64/AArch64SchedNeoverseV3.td

Lines changed: 22 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -178,11 +178,6 @@ def V3Write_1c_2M : SchedWriteRes<[V3UnitM, V3UnitM]> {
178178
let NumMicroOps = 2;
179179
}
180180

181-
def V3Write_2c_2M : SchedWriteRes<[V3UnitM, V3UnitM]> {
182-
let Latency = 2;
183-
let NumMicroOps = 2;
184-
}
185-
186181
def V3Write_3c_2M : SchedWriteRes<[V3UnitM, V3UnitM]> {
187182
let Latency = 3;
188183
let NumMicroOps = 2;
@@ -659,13 +654,6 @@ def V3Write_2c_4SA_4V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
659654
let NumMicroOps = 8;
660655
}
661656

662-
def V3Write_4c_4SA_4V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
663-
V3UnitSA, V3UnitV01, V3UnitV01,
664-
V3UnitV01, V3UnitV01]> {
665-
let Latency = 4;
666-
let NumMicroOps = 8;
667-
}
668-
669657
def V3Write_6c_2SA_6V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitV01,
670658
V3UnitV01, V3UnitV01, V3UnitV01,
671659
V3UnitV01, V3UnitV01]> {
@@ -740,6 +728,14 @@ def V3Write_10c_4L_8V : SchedWriteRes<[V3UnitL, V3UnitL, V3UnitL,
740728
let NumMicroOps = 12;
741729
}
742730

731+
def V3Write_4c_6SA_6V01 : SchedWriteRes<[V3UnitSA, V3UnitSA, V3UnitSA,
732+
V3UnitSA, V3UnitSA, V3UnitSA,
733+
V3UnitV01, V3UnitV01, V3UnitV01,
734+
V3UnitV01, V3UnitV01, V3UnitV01]> {
735+
let Latency = 4;
736+
let NumMicroOps = 12;
737+
}
738+
743739
//===----------------------------------------------------------------------===//
744740
// Define generic 16 micro-op types
745741

@@ -970,8 +966,8 @@ def V3Rd_ZSA : SchedReadAdvance<3, [V3Wr_ZSA]>;
970966

971967
def V3Wr_ZDOTB : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
972968
def V3Rd_ZDOTB : SchedReadAdvance<2, [V3Wr_ZDOTB]>;
973-
def V3Wr_ZDOTH : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
974-
def V3Rd_ZDOTH : SchedReadAdvance<3, [V3Wr_ZDOTH]>;
969+
def V3Wr_ZDOTH : SchedWriteRes<[V3UnitV02]> { let Latency = 3; }
970+
def V3Rd_ZDOTH : SchedReadAdvance<2, [V3Wr_ZDOTH]>;
975971

976972
// NOTE: SOG p. 43: Complex multiply-add B, H, S element size: How to reduce
977973
// throughput to 1 in case of forwarding?
@@ -983,7 +979,7 @@ def V3Rd_ZCMAD : SchedReadAdvance<2, [V3Wr_ZCMAD]>;
983979
def V3Wr_ZMMA : SchedWriteRes<[V3UnitV]> { let Latency = 3; }
984980
def V3Rd_ZMMA : SchedReadAdvance<2, [V3Wr_ZMMA]>;
985981

986-
def V3Wr_ZMABHS : SchedWriteRes<[V3UnitV02, V3UnitV02]> { let Latency = 4; }
982+
def V3Wr_ZMABHS : SchedWriteRes<[V3UnitV02]> { let Latency = 4; }
987983
def V3Rd_ZMABHS : SchedReadAdvance<3, [V3Wr_ZMABHS]>;
988984
def V3Wr_ZMAD : SchedWriteRes<[V3UnitV02, V3UnitV02]> { let Latency = 5; }
989985
def V3Rd_ZMAD : SchedReadAdvance<2, [V3Wr_ZMAD]>;
@@ -2015,7 +2011,7 @@ def : InstRW<[V3Write_1c_1M], (instrs SEL_PPPP)>;
20152011
def : InstRW<[V3Write_2c_1M], (instregex "^PFALSE", "^PTRUE_[BHSD]")>;
20162012

20172013
// Predicate set/initialize, set flags
2018-
def : InstRW<[V3Write_2c_2M], (instregex "^PTRUES_[BHSD]")>;
2014+
def : InstRW<[V3Write_2c_1M], (instregex "^PTRUES_[BHSD]")>;
20192015

20202016
// Predicate find first/next
20212017
def : InstRW<[V3Write_2c_1M], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>;
@@ -2706,41 +2702,41 @@ def : InstRW<[V3Write_2c_1SA_1I_1V01], (instrs STNT1H_ZRR)>;
27062702
def : InstRW<[V3Write_2c_1SA_1V01], (instregex "^STNT1[BWD]_ZRR$")>;
27072703

27082704
// Scatter non temporal store, vector + scalar 32-bit element size
2709-
def : InstRW<[V3Write_4c_4SA_4V01], (instregex "^STNT1[BHW]_ZZR_S")>;
2705+
def : InstRW<[V3Write_4c_6SA_6V01], (instregex "^STNT1[BHW]_ZZR_S")>;
27102706

27112707
// Scatter non temporal store, vector + scalar 64-bit element size
2712-
def : InstRW<[V3Write_2c_2SA_2V01], (instregex "^STNT1[BHWD]_ZZR_D")>;
2708+
def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^STNT1[BHWD]_ZZR_D")>;
27132709

27142710
// Scatter store vector + imm 32-bit element size
2715-
def : InstRW<[V3Write_4c_4SA_4V01], (instregex "^SST1[BH]_S_IMM$",
2711+
def : InstRW<[V3Write_4c_6SA_6V01], (instregex "^SST1[BH]_S_IMM$",
27162712
"^SST1W_IMM$")>;
27172713

27182714
// Scatter store vector + imm 64-bit element size
2719-
def : InstRW<[V3Write_2c_2SA_2V01], (instregex "^SST1[BHW]_D_IMM$",
2715+
def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[BHW]_D_IMM$",
27202716
"^SST1D_IMM$")>;
27212717

27222718
// Scatter store, 32-bit scaled offset
2723-
def : InstRW<[V3Write_4c_4SA_4V01],
2719+
def : InstRW<[V3Write_4c_6SA_6V01],
27242720
(instregex "^SST1(H_S|W)_[SU]XTW_SCALED$")>;
27252721

27262722
// Scatter store, 32-bit unpacked unscaled offset
2727-
def : InstRW<[V3Write_2c_2SA_2V01], (instregex "^SST1[BHW]_D_[SU]XTW$",
2723+
def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[BHW]_D_[SU]XTW$",
27282724
"^SST1D_[SU]XTW$")>;
27292725

27302726
// Scatter store, 32-bit unpacked scaled offset
2731-
def : InstRW<[V3Write_2c_2SA_2V01], (instregex "^SST1[HW]_D_[SU]XTW_SCALED$",
2727+
def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[HW]_D_[SU]XTW_SCALED$",
27322728
"^SST1D_[SU]XTW_SCALED$")>;
27332729

27342730
// Scatter store, 32-bit unscaled offset
2735-
def : InstRW<[V3Write_4c_4SA_4V01], (instregex "^SST1[BH]_S_[SU]XTW$",
2731+
def : InstRW<[V3Write_4c_6SA_6V01], (instregex "^SST1[BH]_S_[SU]XTW$",
27362732
"^SST1W_[SU]XTW$")>;
27372733

27382734
// Scatter store, 64-bit scaled offset
2739-
def : InstRW<[V3Write_2c_2SA_2V01], (instregex "^SST1[HW]_D_SCALED$",
2735+
def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[HW]_D_SCALED$",
27402736
"^SST1D_SCALED$")>;
27412737

27422738
// Scatter store, 64-bit unscaled offset
2743-
def : InstRW<[V3Write_2c_2SA_2V01], (instregex "^SST1[BHW]_D$",
2739+
def : InstRW<[V3Write_2c_3SA_3V01], (instregex "^SST1[BHW]_D$",
27442740
"^SST1D$")>;
27452741

27462742
// §3.30 SVE Miscellaneous instructions

llvm/lib/Target/AArch64/AArch64SchedNeoverseV3AE.td

Lines changed: 4 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -166,11 +166,6 @@ def V3AEWrite_1c_2M : SchedWriteRes<[V3AEUnitM, V3AEUnitM]> {
166166
let NumMicroOps = 2;
167167
}
168168

169-
def V3AEWrite_2c_2M : SchedWriteRes<[V3AEUnitM, V3AEUnitM]> {
170-
let Latency = 2;
171-
let NumMicroOps = 2;
172-
}
173-
174169
def V3AEWrite_3c_2M : SchedWriteRes<[V3AEUnitM, V3AEUnitM]> {
175170
let Latency = 3;
176171
let NumMicroOps = 2;
@@ -899,8 +894,8 @@ def V3AERd_ZSA : SchedReadAdvance<3, [V3AEWr_ZSA]>;
899894

900895
def V3AEWr_ZDOTB : SchedWriteRes<[V3AEUnitV]> { let Latency = 3; }
901896
def V3AERd_ZDOTB : SchedReadAdvance<2, [V3AEWr_ZDOTB]>;
902-
def V3AEWr_ZDOTH : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; }
903-
def V3AERd_ZDOTH : SchedReadAdvance<3, [V3AEWr_ZDOTH]>;
897+
def V3AEWr_ZDOTH : SchedWriteRes<[V3AEUnitV0]> { let Latency = 3; }
898+
def V3AERd_ZDOTH : SchedReadAdvance<2, [V3AEWr_ZDOTH]>;
904899

905900
// NOTE: SOG p. 43: Complex multiply-add B, H, S element size: How to reduce
906901
// throughput to 1 in case of forwarding?
@@ -912,7 +907,7 @@ def V3AERd_ZCMAD : SchedReadAdvance<2, [V3AEWr_ZCMAD]>;
912907
def V3AEWr_ZMMA : SchedWriteRes<[V3AEUnitV]> { let Latency = 3; }
913908
def V3AERd_ZMMA : SchedReadAdvance<2, [V3AEWr_ZMMA]>;
914909

915-
def V3AEWr_ZMABHS : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> { let Latency = 4; }
910+
def V3AEWr_ZMABHS : SchedWriteRes<[V3AEUnitV0]> { let Latency = 4; }
916911
def V3AERd_ZMABHS : SchedReadAdvance<3, [V3AEWr_ZMABHS]>;
917912
def V3AEWr_ZMAD : SchedWriteRes<[V3AEUnitV0, V3AEUnitV0]> { let Latency = 5; }
918913
def V3AERd_ZMAD : SchedReadAdvance<2, [V3AEWr_ZMAD]>;
@@ -1944,7 +1939,7 @@ def : InstRW<[V3AEWrite_1c_1M], (instrs SEL_PPPP)>;
19441939
def : InstRW<[V3AEWrite_2c_1M], (instregex "^PFALSE", "^PTRUE_[BHSD]")>;
19451940

19461941
// Predicate set/initialize, set flags
1947-
def : InstRW<[V3AEWrite_2c_2M], (instregex "^PTRUES_[BHSD]")>;
1942+
def : InstRW<[V3AEWrite_2c_1M], (instregex "^PTRUES_[BHSD]")>;
19481943

19491944
// Predicate find first/next
19501945
def : InstRW<[V3AEWrite_2c_1M], (instregex "^PFIRST_B", "^PNEXT_[BHSD]")>;

llvm/test/tools/llvm-mca/AArch64/Neoverse/V3-forwarding.s

Lines changed: 37 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1165,26 +1165,26 @@ bfmlalb z0.s, z0.h, z1.h
11651165

11661166
# CHECK: Iterations: 100
11671167
# CHECK-NEXT: Instructions: 400
1168-
# CHECK-NEXT: Total Cycles: 1403
1168+
# CHECK-NEXT: Total Cycles: 1203
11691169
# CHECK-NEXT: Total uOps: 500
11701170

11711171
# CHECK: Dispatch Width: 10
1172-
# CHECK-NEXT: uOps Per Cycle: 0.36
1173-
# CHECK-NEXT: IPC: 0.29
1172+
# CHECK-NEXT: uOps Per Cycle: 0.42
1173+
# CHECK-NEXT: IPC: 0.33
11741174
# CHECK-NEXT: Block RThroughput: 2.5
11751175

11761176
# CHECK: Timeline view:
1177-
# CHECK-NEXT: 0123456789 0
1178-
# CHECK-NEXT: Index 0123456789 0123456789
1177+
# CHECK-NEXT: 0123456789
1178+
# CHECK-NEXT: Index 0123456789 0123456
11791179

1180-
# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d
1181-
# CHECK-NEXT: [0,1] D=====eeeeER . . . . cdot z0.d, z1.h, z2.h, #90
1182-
# CHECK-NEXT: [0,2] D======eeeeER . . . . cdot z0.d, z1.h, z2.h, #90
1183-
# CHECK-NEXT: [0,3] D==========eeeeER . . . cdot z0.d, z0.h, z1.h, #90
1184-
# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
1185-
# CHECK-NEXT: [1,1] D===================eeeeER . cdot z0.d, z1.h, z2.h, #90
1186-
# CHECK-NEXT: [1,2] D====================eeeeER . cdot z0.d, z1.h, z2.h, #90
1187-
# CHECK-NEXT: [1,3] D========================eeeeER cdot z0.d, z0.h, z1.h, #90
1180+
# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, z0.d, z0.d
1181+
# CHECK-NEXT: [0,1] D=====eeeER . . .. cdot z0.d, z1.h, z2.h, #90
1182+
# CHECK-NEXT: [0,2] D======eeeER . . .. cdot z0.d, z1.h, z2.h, #90
1183+
# CHECK-NEXT: [0,3] D=========eeeER. . .. cdot z0.d, z0.h, z1.h, #90
1184+
# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, z0.d, z0.d
1185+
# CHECK-NEXT: [1,1] D=================eeeER .. cdot z0.d, z1.h, z2.h, #90
1186+
# CHECK-NEXT: [1,2] D==================eeeER .. cdot z0.d, z1.h, z2.h, #90
1187+
# CHECK-NEXT: [1,3] D=====================eeeER cdot z0.d, z0.h, z1.h, #90
11881188

11891189
# CHECK: Average Wait times (based on the timeline view):
11901190
# CHECK-NEXT: [0]: Executions
@@ -1193,11 +1193,11 @@ bfmlalb z0.s, z0.h, z1.h
11931193
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
11941194

11951195
# CHECK: [0] [1] [2] [3]
1196-
# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
1197-
# CHECK-NEXT: 1. 2 13.0 0.0 0.0 cdot z0.d, z1.h, z2.h, #90
1198-
# CHECK-NEXT: 2. 2 14.0 0.0 0.0 cdot z0.d, z1.h, z2.h, #90
1199-
# CHECK-NEXT: 3. 2 18.0 0.0 0.0 cdot z0.d, z0.h, z1.h, #90
1200-
# CHECK-NEXT: 2 13.3 0.1 0.0 <total>
1196+
# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, z0.d, z0.d
1197+
# CHECK-NEXT: 1. 2 12.0 0.0 0.0 cdot z0.d, z1.h, z2.h, #90
1198+
# CHECK-NEXT: 2. 2 13.0 0.0 0.0 cdot z0.d, z1.h, z2.h, #90
1199+
# CHECK-NEXT: 3. 2 16.0 0.0 0.0 cdot z0.d, z0.h, z1.h, #90
1200+
# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
12011201

12021202
# CHECK: [23] Code Region - Z cmla.b
12031203

@@ -1355,26 +1355,26 @@ bfmlalb z0.s, z0.h, z1.h
13551355

13561356
# CHECK: Iterations: 100
13571357
# CHECK-NEXT: Instructions: 400
1358-
# CHECK-NEXT: Total Cycles: 1403
1358+
# CHECK-NEXT: Total Cycles: 1203
13591359
# CHECK-NEXT: Total uOps: 500
13601360

13611361
# CHECK: Dispatch Width: 10
1362-
# CHECK-NEXT: uOps Per Cycle: 0.36
1363-
# CHECK-NEXT: IPC: 0.29
1362+
# CHECK-NEXT: uOps Per Cycle: 0.42
1363+
# CHECK-NEXT: IPC: 0.33
13641364
# CHECK-NEXT: Block RThroughput: 2.5
13651365

13661366
# CHECK: Timeline view:
1367-
# CHECK-NEXT: 0123456789 0
1368-
# CHECK-NEXT: Index 0123456789 0123456789
1367+
# CHECK-NEXT: 0123456789
1368+
# CHECK-NEXT: Index 0123456789 0123456
13691369

1370-
# CHECK: [0,0] DeeeeeER . . . . . mul z0.d, z0.d, z0.d
1371-
# CHECK-NEXT: [0,1] D=====eeeeER . . . . sdot z0.d, z1.h, z2.h
1372-
# CHECK-NEXT: [0,2] D======eeeeER . . . . sdot z0.d, z1.h, z2.h
1373-
# CHECK-NEXT: [0,3] D==========eeeeER . . . sdot z0.d, z0.h, z1.h
1374-
# CHECK-NEXT: [1,0] D==============eeeeeER . . mul z0.d, z0.d, z0.d
1375-
# CHECK-NEXT: [1,1] D===================eeeeER . sdot z0.d, z1.h, z2.h
1376-
# CHECK-NEXT: [1,2] D====================eeeeER . sdot z0.d, z1.h, z2.h
1377-
# CHECK-NEXT: [1,3] D========================eeeeER sdot z0.d, z0.h, z1.h
1370+
# CHECK: [0,0] DeeeeeER . . . .. mul z0.d, z0.d, z0.d
1371+
# CHECK-NEXT: [0,1] D=====eeeER . . .. sdot z0.d, z1.h, z2.h
1372+
# CHECK-NEXT: [0,2] D======eeeER . . .. sdot z0.d, z1.h, z2.h
1373+
# CHECK-NEXT: [0,3] D=========eeeER. . .. sdot z0.d, z0.h, z1.h
1374+
# CHECK-NEXT: [1,0] D============eeeeeER. .. mul z0.d, z0.d, z0.d
1375+
# CHECK-NEXT: [1,1] D=================eeeER .. sdot z0.d, z1.h, z2.h
1376+
# CHECK-NEXT: [1,2] D==================eeeER .. sdot z0.d, z1.h, z2.h
1377+
# CHECK-NEXT: [1,3] D=====================eeeER sdot z0.d, z0.h, z1.h
13781378

13791379
# CHECK: Average Wait times (based on the timeline view):
13801380
# CHECK-NEXT: [0]: Executions
@@ -1383,11 +1383,11 @@ bfmlalb z0.s, z0.h, z1.h
13831383
# CHECK-NEXT: [3]: Average time elapsed from WB until retire stage
13841384

13851385
# CHECK: [0] [1] [2] [3]
1386-
# CHECK-NEXT: 0. 2 8.0 0.5 0.0 mul z0.d, z0.d, z0.d
1387-
# CHECK-NEXT: 1. 2 13.0 0.0 0.0 sdot z0.d, z1.h, z2.h
1388-
# CHECK-NEXT: 2. 2 14.0 0.0 0.0 sdot z0.d, z1.h, z2.h
1389-
# CHECK-NEXT: 3. 2 18.0 0.0 0.0 sdot z0.d, z0.h, z1.h
1390-
# CHECK-NEXT: 2 13.3 0.1 0.0 <total>
1386+
# CHECK-NEXT: 0. 2 7.0 0.5 0.0 mul z0.d, z0.d, z0.d
1387+
# CHECK-NEXT: 1. 2 12.0 0.0 0.0 sdot z0.d, z1.h, z2.h
1388+
# CHECK-NEXT: 2. 2 13.0 0.0 0.0 sdot z0.d, z1.h, z2.h
1389+
# CHECK-NEXT: 3. 2 16.0 0.0 0.0 sdot z0.d, z0.h, z1.h
1390+
# CHECK-NEXT: 2 12.0 0.1 0.0 <total>
13911391

13921392
# CHECK: [28] Code Region - Z smmla
13931393

@@ -1437,7 +1437,7 @@ bfmlalb z0.s, z0.h, z1.h
14371437
# CHECK: Dispatch Width: 10
14381438
# CHECK-NEXT: uOps Per Cycle: 0.36
14391439
# CHECK-NEXT: IPC: 0.29
1440-
# CHECK-NEXT: Block RThroughput: 4.0
1440+
# CHECK-NEXT: Block RThroughput: 2.5
14411441

14421442
# CHECK: Timeline view:
14431443
# CHECK-NEXT: 0123456789 0

0 commit comments

Comments
 (0)