Skip to content

Commit ec490b1

Browse files
authored
[AArch64] Add SchedReadAdvance to the Neoverse-N3 scheduling model (#167302)
Introduce a description of late forwarding to the Neoverse-N3 scheduling model.
1 parent f67409c commit ec490b1

File tree

3 files changed

+2211
-64
lines changed

3 files changed

+2211
-64
lines changed

llvm/lib/Target/AArch64/AArch64SchedNeoverseN3.td

Lines changed: 159 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -553,6 +553,107 @@ def N3Write_16c_16V0 : SchedWriteRes<[N3UnitV0, N3UnitV0, N3UnitV0, N3UnitV0,
553553
let NumMicroOps = 16;
554554
}
555555

556+
//===----------------------------------------------------------------------===//
557+
// Define forwarded types
558+
// NOTE: SOG, p. 19, n. 2: Accumulator forwarding is not supported for
559+
// consumers of 64 bit multiply high operations?
560+
561+
def N3Wr_FMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
562+
def N3Rd_FMA : SchedReadAdvance<2, [WriteFMul, N3Wr_FMA]>;
563+
564+
def N3Wr_VMA : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
565+
def N3Rd_VMA : SchedReadAdvance<3, [N3Wr_VMA]>;
566+
567+
def N3Wr_VMAL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
568+
def N3Rd_VMAL : SchedReadAdvance<3, [N3Wr_VMAL]>;
569+
570+
def N3Wr_VMAH : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
571+
def N3Rd_VMAH : SchedReadAdvance<2, [N3Wr_VMAH]>;
572+
573+
def N3Wr_VMASL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
574+
def N3Rd_VMASL : SchedReadAdvance<2, [N3Wr_VMASL]>;
575+
576+
def N3Wr_ADA : SchedWriteRes<[N3UnitV1]> { let Latency = 4; }
577+
def N3Rd_ADA : SchedReadAdvance<3, [N3Wr_ADA]>;
578+
579+
def N3Wr_VDOT : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
580+
def N3Rd_VDOT : SchedReadAdvance<2, [N3Wr_VDOT]>;
581+
582+
def N3Wr_VMMA : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
583+
def N3Rd_VMMA : SchedReadAdvance<2, [N3Wr_VMMA]>;
584+
585+
def N3Wr_FCMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
586+
def N3Rd_FCMA : SchedReadAdvance<2, [N3Wr_FCMA]>;
587+
588+
def N3Wr_FPM : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
589+
def N3Wr_FPMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
590+
def N3Rd_FPMA : SchedReadAdvance<2, [N3Wr_FPM, N3Wr_FPMA]>;
591+
592+
def N3Wr_FPMAL : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
593+
def N3Rd_FPMAL : SchedReadAdvance<2, [N3Wr_FPMAL]>;
594+
595+
def N3Wr_BFD : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
596+
def N3Rd_BFD : SchedReadAdvance<2, [N3Wr_BFD]>;
597+
598+
def N3Wr_BFMMA : SchedWriteRes<[N3UnitV]> { let Latency = 5; }
599+
def N3Rd_BFMMA : SchedReadAdvance<2, [N3Wr_BFMMA]>;
600+
601+
def N3Wr_BFMLA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
602+
def N3Rd_BFMLA : SchedReadAdvance<2, [N3Wr_BFMLA]>;
603+
604+
def N3Wr_CRC : SchedWriteRes<[N3UnitM0]> { let Latency = 2; }
605+
def N3Rd_CRC : SchedReadAdvance<1, [N3Wr_CRC]>;
606+
607+
def N3Wr_ZA : SchedWriteRes<[N3UnitV1]> { let Latency = 4; }
608+
def N3Rd_ZA : SchedReadAdvance<3, [N3Wr_ZA]>;
609+
def N3Wr_ZPA : SchedWriteRes<[N3UnitV1]> { let Latency = 4; }
610+
def N3Rd_ZPA : SchedReadAdvance<3, [N3Wr_ZPA]>;
611+
def N3Wr_ZSA : SchedWriteRes<[N3UnitV1]> { let Latency = 4; }
612+
def N3Rd_ZSA : SchedReadAdvance<3, [N3Wr_ZSA]>;
613+
614+
def N3Wr_ZDOTB : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
615+
def N3Rd_ZDOTB : SchedReadAdvance<2, [N3Wr_ZDOTB]>;
616+
def N3Wr_ZDOTH : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
617+
def N3Rd_ZDOTH : SchedReadAdvance<3, [N3Wr_ZDOTH]>;
618+
619+
def N3Wr_ZCMABHS : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
620+
def N3Rd_ZCMABHS : SchedReadAdvance<3, [N3Wr_ZCMABHS]>;
621+
def N3Wr_ZCMAD : SchedWriteRes<[N3UnitV0, N3UnitV0]> { let Latency = 5; }
622+
def N3Rd_ZCMAD : SchedReadAdvance<2, [N3Wr_ZCMAD]>;
623+
624+
def N3Wr_ZMMA : SchedWriteRes<[N3UnitV]> { let Latency = 3; }
625+
def N3Rd_ZMMA : SchedReadAdvance<2, [N3Wr_ZMMA]>;
626+
627+
def N3Wr_ZMABHS : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
628+
def N3Rd_ZMABHS : SchedReadAdvance<3, [N3Wr_ZMABHS]>;
629+
def N3Wr_ZMAD : SchedWriteRes<[N3UnitV0, N3UnitV0]> { let Latency = 5; }
630+
def N3Rd_ZMAD : SchedReadAdvance<2, [N3Wr_ZMAD]>;
631+
632+
def N3Wr_ZMAL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
633+
def N3Rd_ZMAL : SchedReadAdvance<3, [N3Wr_ZMAL]>;
634+
635+
def N3Wr_ZMASQL : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
636+
def N3Wr_ZMASQBHS : SchedWriteRes<[N3UnitV0]> { let Latency = 4; }
637+
def N3Wr_ZMASQD : SchedWriteRes<[N3UnitV0, N3UnitV0]> { let Latency = 5; }
638+
def N3Rd_ZMASQ : SchedReadAdvance<2, [N3Wr_ZMASQL, N3Wr_ZMASQBHS,
639+
N3Wr_ZMASQD]>;
640+
641+
def N3Wr_ZFCMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
642+
def N3Rd_ZFCMA : SchedReadAdvance<2, [N3Wr_ZFCMA]>;
643+
644+
def N3Wr_ZFMA : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
645+
def N3Rd_ZFMA : SchedReadAdvance<2, [N3Wr_ZFMA]>;
646+
647+
def N3Wr_ZFMAL : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
648+
def N3Rd_ZFMAL : SchedReadAdvance<2, [N3Wr_ZFMAL]>;
649+
650+
def N3Wr_ZBFDOT : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
651+
def N3Rd_ZBFDOT : SchedReadAdvance<2, [N3Wr_ZBFDOT]>;
652+
def N3Wr_ZBFMMA : SchedWriteRes<[N3UnitV]> { let Latency = 5; }
653+
def N3Rd_ZBFMMA : SchedReadAdvance<2, [N3Wr_ZBFMMA]>;
654+
def N3Wr_ZBFMAL : SchedWriteRes<[N3UnitV]> { let Latency = 4; }
655+
def N3Rd_ZBFMAL : SchedReadAdvance<2, [N3Wr_ZBFMAL]>;
656+
556657
// Miscellaneous
557658
// -----------------------------------------------------------------------------
558659

@@ -832,10 +933,11 @@ def : SchedAlias<WriteFDiv , N3Write_7c_1V0>;
832933
def : InstRW<[N3Write_12c_1V0], (instrs FDIVDrr, FSQRTDr)>;
833934

834935
// FP multiply
835-
def : SchedAlias<WriteFMul, N3Write_3c_1V>;
936+
def : WriteRes<WriteFMul, [N3UnitV]> { let Latency = 3; }
836937

837938
// FP multiply accumulate
838-
def : InstRW<[N3Write_4c_1V], (instregex "^(FMADD|FMSUB|FNMADD|FNMSUB)[DHS]rrr$")>;
939+
def : InstRW<[N3Wr_FMA, ReadDefault, ReadDefault, N3Rd_FMA],
940+
(instregex "^(FMADD|FMSUB|FNMADD|FNMSUB)[DHS]rrr$")>;
839941

840942
// FP round to integral
841943
def : InstRW<[N3Write_3c_1V0], (instregex "^FRINT([AIMNPXZ]|32X|64X|32Z|64Z)[DHS]r$")>;
@@ -969,7 +1071,7 @@ def : SchedAlias<WriteVq, N3Write_2c_1V>;
9691071
// ASIMD absolute diff accum long
9701072
// ASIMD pairwise add and accumulate long
9711073
// ASIMD shift accumulate
972-
def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ABAL?v",
1074+
def : InstRW<[N3Wr_ADA, N3Rd_ADA], (instregex "^[SU]ABAL?v",
9731075
"^[SU]ADALPv",
9741076
"^[SU]R?SRAv")>;
9751077

@@ -984,10 +1086,11 @@ def : InstRW<[N3Write_6c_2V1], (instregex "^[SU]?ADDL?Vv16i8v$")>;
9841086

9851087
// ASIMD dot product
9861088
// ASIMD dot product using signed and unsigned integers
987-
def : InstRW<[N3Write_3c_1V], (instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>;
1089+
def : InstRW<[N3Wr_VDOT, N3Rd_VDOT],
1090+
(instregex "^([SU]|SU|US)DOT(lane)?(v8|v16)i8$")>;
9881091

9891092
// ASIMD matrix multiply-accumulate
990-
def : InstRW<[N3Write_3c_1V], (instrs SMMLA, UMMLA, USMMLA)>;
1093+
def : InstRW<[N3Wr_VMMA, N3Rd_VMMA], (instrs SMMLA, UMMLA, USMMLA)>;
9911094

9921095
// ASIMD max/min, reduce, 4H/4S
9931096
def : InstRW<[N3Write_3c_1V1], (instregex "^[SU](MAX|MIN)Vv4i(16|32)v$")>;
@@ -1002,16 +1105,16 @@ def : InstRW<[N3Write_6c_2V1], (instregex "[SU](MAX|MIN)Vv16i8v$")>;
10021105
def : InstRW<[N3Write_4c_1V0], (instregex "^MULv", "^SQ(R)?DMULHv")>;
10031106

10041107
// ASIMD multiply accumulate
1005-
def : InstRW<[N3Write_4c_1V0], (instregex "^MLAv", "^MLSv")>;
1108+
def : InstRW<[N3Wr_VMA, N3Rd_VMA], (instregex "^MLAv", "^MLSv")>;
10061109

10071110
// ASIMD multiply accumulate high
1008-
def : InstRW<[N3Write_4c_1V0], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
1111+
def : InstRW<[N3Wr_VMAH, N3Rd_VMAH], (instregex "^SQRDMLAHv", "^SQRDMLSHv")>;
10091112

10101113
// ASIMD multiply accumulate long
1011-
def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
1114+
def : InstRW<[N3Wr_VMAL, N3Rd_VMAL], (instregex "^[SU]MLALv", "^[SU]MLSLv")>;
10121115

10131116
// ASIMD multiply accumulate saturating long
1014-
def : InstRW<[N3Write_4c_1V0], (instregex "^SQDMLALv", "^SQDMLSLv")>;
1117+
def : InstRW<[N3Wr_VMASL, N3Rd_VMASL], (instregex "^SQDMLALv", "^SQDMLSLv")>;
10151118

10161119
// ASIMD multiply/multiply long (8x8) polynomial, D-form
10171120
// ASIMD multiply/multiply long (8x8) polynomial, Q-form
@@ -1058,7 +1161,7 @@ def : InstRW<[N3Write_4c_1V1],
10581161
def : InstRW<[N3Write_3c_1V], (instregex "^FCADDv")>;
10591162

10601163
// ASIMD FP complex multiply add
1061-
def : InstRW<[N3Write_4c_1V], (instregex "^FCMLAv")>;
1164+
def : InstRW<[N3Wr_FCMA, N3Rd_FCMA], (instregex "^FCMLAv")>;
10621165

10631166
// ASIMD FP convert, long (F16 to F32)
10641167
def : InstRW<[N3Write_4c_2V0], (instregex "^FCVTL(v4|v8)i16")>;
@@ -1114,13 +1217,13 @@ def : InstRW<[N3Write_4c_2V], (instregex "^(FMAX|FMIN)(NM)?Vv4(i16|i32)v$")>;
11141217
def : InstRW<[N3Write_6c_3V], (instregex "^(FMAX|FMIN)(NM)?Vv8i16v$")>;
11151218

11161219
// ASIMD FP multiply
1117-
def : InstRW<[N3Write_3c_1V], (instregex "^FMULv", "^FMULXv")>;
1220+
def : InstRW<[N3Wr_FPM], (instregex "^FMULv", "^FMULXv")>;
11181221

11191222
// ASIMD FP multiply accumulate
1120-
def : InstRW<[N3Write_4c_1V], (instregex "^FMLAv", "^FMLSv")>;
1223+
def : InstRW<[N3Wr_FPMA, N3Rd_FPMA], (instregex "^FMLAv", "^FMLSv")>;
11211224

11221225
// ASIMD FP multiply accumulate long
1123-
def : InstRW<[N3Write_4c_1V], (instregex "^FMLALv", "^FMLSLv")>;
1226+
def : InstRW<[N3Wr_FPMAL, N3Rd_FPMAL], (instregex "^FMLALv", "^FMLSLv")>;
11241227

11251228
// ASIMD FP round, D-form F32 and Q-form F64
11261229
def : InstRW<[N3Write_3c_1V0],
@@ -1157,13 +1260,14 @@ def : InstRW<[N3Write_13c_2V0], (instrs FSQRTv2f64)>;
11571260
def : InstRW<[N3Write_4c_2V0], (instrs BFCVTN, BFCVTN2)>;
11581261

11591262
// ASIMD dot product
1160-
def : InstRW<[N3Write_4c_1V], (instrs BFDOTv4bf16, BFDOTv8bf16)>;
1263+
def : InstRW<[N3Wr_BFD, N3Rd_BFD], (instrs BFDOTv4bf16, BFDOTv8bf16)>;
11611264

11621265
// ASIMD matrix multiply accumulate
1163-
def : InstRW<[N3Write_5c_1V], (instrs BFMMLA)>;
1266+
def : InstRW<[N3Wr_BFMMA, N3Rd_BFMMA], (instrs BFMMLA)>;
11641267

11651268
// ASIMD multiply accumulate long
1166-
def : InstRW<[N3Write_4c_1V], (instrs BFMLALB, BFMLALBIdx, BFMLALT, BFMLALTIdx)>;
1269+
def : InstRW<[N3Wr_BFMLA, N3Rd_BFMLA],
1270+
(instrs BFMLALB, BFMLALBIdx, BFMLALT, BFMLALTIdx)>;
11671271

11681272
// Scalar convert, F32 to BF16
11691273
def : InstRW<[N3Write_3c_1V0], (instrs BFCVT)>;
@@ -1502,7 +1606,7 @@ def : InstRW<[N3Write_4c_1V0], (instrs SM4E, SM4ENCKEY)>;
15021606
// -----------------------------------------------------------------------------
15031607

15041608
// CRC checksum ops
1505-
def : InstRW<[N3Write_2c_1M0], (instregex "^CRC32")>;
1609+
def : InstRW<[N3Wr_CRC, N3Rd_CRC], (instregex "^CRC32")>;
15061610

15071611
// SVE Predicate instructions
15081612
// -----------------------------------------------------------------------------
@@ -1592,10 +1696,10 @@ def : InstRW<[N3Write_2c_1V], (instregex "^[SU]ABD_ZPmZ_[BHSD]",
15921696
"^[SU]ABD_ZPZZ_[BHSD]")>;
15931697

15941698
// Arithmetic, absolute diff accum
1595-
def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ABA_ZZZ_[BHSD]$")>;
1699+
def : InstRW<[N3Wr_ZA, N3Rd_ZA], (instregex "^[SU]ABA_ZZZ_[BHSD]$")>;
15961700

15971701
// Arithmetic, absolute diff accum long
1598-
def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]$")>;
1702+
def : InstRW<[N3Wr_ZA, N3Rd_ZA], (instregex "^[SU]ABAL[TB]_ZZZ_[HSD]$")>;
15991703

16001704
// Arithmetic, absolute diff long
16011705
def : InstRW<[N3Write_2c_1V], (instregex "^[SU]ABDL[TB]_ZZZ_[HSD]$")>;
@@ -1629,7 +1733,8 @@ def : InstRW<[N3Write_2c_1V], (instregex "^(AD|SB)CL[BT]_ZZZ_[SD]$")>;
16291733
def : InstRW<[N3Write_2c_1V], (instregex "^ADDP_ZPmZ_[BHSD]$")>;
16301734

16311735
// Arithmetic, pairwise add and accum long
1632-
def : InstRW<[N3Write_4c_1V1], (instregex "^[SU]ADALP_ZPmZ_[HSD]$")>;
1736+
def : InstRW<[N3Wr_ZPA, ReadDefault, N3Rd_ZPA],
1737+
(instregex "^[SU]ADALP_ZPmZ_[HSD]$")>;
16331738

16341739
// Arithmetic, shift
16351740
def : InstRW<[N3Write_2c_1V1],
@@ -1642,7 +1747,7 @@ def : InstRW<[N3Write_2c_1V1],
16421747
"^(ASRR|LSLR|LSRR)_ZPmZ_[BHSD]")>;
16431748

16441749
// Arithmetic, shift and accumulate
1645-
def : InstRW<[N3Write_4c_1V1],
1750+
def : InstRW<[N3Wr_ZSA, N3Rd_ZSA],
16461751
(instregex "^(SRSRA|SSRA|URSRA|USRA)_ZZI_[BHSD]$")>;
16471752

16481753
// Arithmetic, shift by immediate
@@ -1688,16 +1793,17 @@ def : InstRW<[N3Write_2c_1V],
16881793
def : InstRW<[N3Write_2c_1V], (instregex "^(SQ)?CADD_ZZI_[BHSD]$")>;
16891794

16901795
// Complex dot product 8-bit element
1691-
def : InstRW<[N3Write_3c_1V], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
1796+
def : InstRW<[N3Wr_ZDOTB, N3Rd_ZDOTB], (instrs CDOT_ZZZ_S, CDOT_ZZZI_S)>;
16921797

16931798
// Complex dot product 16-bit element
1694-
def : InstRW<[N3Write_4c_1V0], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
1799+
def : InstRW<[N3Wr_ZDOTH, N3Rd_ZDOTH], (instrs CDOT_ZZZ_D, CDOT_ZZZI_D)>;
16951800

16961801
// Complex multiply-add B, H, S element size
1697-
def : InstRW<[N3Write_4c_1V0], (instregex "^CMLA_ZZZ_[BHS]$", "^CMLA_ZZZI_[HS]$")>;
1802+
def : InstRW<[N3Wr_ZCMABHS, N3Rd_ZCMABHS],
1803+
(instregex "^CMLA_ZZZ_[BHS]$", "^CMLA_ZZZI_[HS]$")>;
16981804

16991805
// Complex multiply-add D element size
1700-
def : InstRW<[N3Write_5c_2V0], (instrs CMLA_ZZZ_D)>;
1806+
def : InstRW<[N3Wr_ZCMAD, N3Rd_ZCMAD], (instrs CMLA_ZZZ_D)>;
17011807

17021808
// Conditional extract operations, scalar form
17031809
def : InstRW<[N3Write_8c_1M0_1V], (instregex "^CLAST[AB]_RPZ_[BHSD]$")>;
@@ -1736,13 +1842,14 @@ def : InstRW<[N3Write_16c_16V0], (instregex "^[SU]DIVR?_ZPmZ_D",
17361842
"^[SU]DIV_ZPZZ_D")>;
17371843

17381844
// Dot product, 8 bit
1739-
def : InstRW<[N3Write_3c_1V], (instregex "^[SU]DOT_ZZZI?_BtoS$")>;
1845+
def : InstRW<[N3Wr_ZDOTB, N3Rd_ZDOTB], (instregex "^[SU]DOT_ZZZI?_BtoS$")>;
17401846

17411847
// Dot product, 8 bit, using signed and unsigned integers
1742-
def : InstRW<[N3Write_3c_1V], (instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
1848+
def : InstRW<[N3Wr_ZDOTB, N3Rd_ZDOTB],
1849+
(instrs SUDOT_ZZZI, USDOT_ZZZI, USDOT_ZZZ)>;
17431850

17441851
// Dot product, 16 bit
1745-
def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]DOT_ZZZI?_HtoD$")>;
1852+
def : InstRW<[N3Wr_ZDOTH, N3Rd_ZDOTH], (instregex "^[SU]DOT_ZZZI?_HtoD$")>;
17461853

17471854
// Duplicate, immediate and indexed form
17481855
def : InstRW<[N3Write_2c_1V], (instregex "^DUP_ZI_[BHSD]$",
@@ -1804,7 +1911,7 @@ def : InstRW<[N3Write_2c_1V], (instregex "^[SU](MAX|MIN)_ZI_[BHSD]",
18041911
def : InstRW<[N3Write_2c_1V], (instregex "^N?MATCH_PPzZZ_[BH]$")>;
18051912

18061913
// Matrix multiply-accumulate
1807-
def : InstRW<[N3Write_3c_1V], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
1914+
def : InstRW<[N3Wr_ZMMA, N3Rd_ZMMA], (instrs SMMLA_ZZZ, UMMLA_ZZZ, USMMLA_ZZZ)>;
18081915

18091916
// Move prefix
18101917
def : InstRW<[N3Write_2c_1V], (instregex "^MOVPRFX_ZP[mz]Z_[BHSD]$",
@@ -1827,20 +1934,22 @@ def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]MULL[BT]_ZZZI_[SD]$",
18271934
"^[SU]MULL[BT]_ZZZ_[HSD]$")>;
18281935

18291936
// Multiply accumulate, B, H, S element size
1830-
def : InstRW<[N3Write_4c_1V0], (instregex "^ML[AS]_ZZZI_[BHS]$",
1831-
"^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_[BHS]")>;
1937+
def : InstRW<[N3Wr_ZMABHS, ReadDefault, N3Rd_ZMABHS],
1938+
(instregex "^ML[AS]_ZZZI_[BHS]$",
1939+
"^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_[BHS]")>;
18321940

18331941
// Multiply accumulate, D element size
1834-
def : InstRW<[N3Write_5c_2V0], (instregex "^ML[AS]_ZZZI_D$",
1942+
def : InstRW<[N3Wr_ZMAD, ReadDefault, N3Rd_ZMAD], (instregex "^ML[AS]_ZZZI_D$",
18351943
"^(ML[AS]|MAD|MSB)_(ZPmZZ|ZPZZZ)_D")>;
18361944

18371945
// Multiply accumulate long
1838-
def : InstRW<[N3Write_4c_1V0], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]$",
1946+
def : InstRW<[N3Wr_ZMAL, N3Rd_ZMAL], (instregex "^[SU]ML[AS]L[BT]_ZZZ_[HSD]$",
18391947
"^[SU]ML[AS]L[BT]_ZZZI_[SD]$")>;
18401948

18411949
// Multiply accumulate saturating doubling long regular
1842-
def : InstRW<[N3Write_4c_1V0], (instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]$",
1843-
"^SQDML[AS](LB|LT)_ZZZI_[SD]$")>;
1950+
def : InstRW<[N3Wr_ZMASQL, N3Rd_ZMASQ],
1951+
(instregex "^SQDML[AS](LB|LT|LBT)_ZZZ_[HSD]$",
1952+
"^SQDML[AS](LB|LT)_ZZZI_[SD]$")>;
18441953

18451954
// Multiply saturating doubling high, B, H, S element size
18461955
def : InstRW<[N3Write_4c_1V0], (instregex "^SQDMULH_ZZZ_[BHS]$",
@@ -1854,13 +1963,13 @@ def : InstRW<[N3Write_4c_1V0], (instregex "^SQDMULL[BT]_ZZZ_[HSD]$",
18541963
"^SQDMULL[BT]_ZZZI_[SD]$")>;
18551964

18561965
// Multiply saturating rounding doubling regular/complex accumulate, B, H, S element size
1857-
def : InstRW<[N3Write_4c_1V0], (instregex "^SQRDML[AS]H_ZZZ_[BHS]$",
1966+
def : InstRW<[N3Wr_ZMASQBHS, N3Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZ_[BHS]$",
18581967
"^SQRDCMLAH_ZZZ_[BHS]$",
18591968
"^SQRDML[AS]H_ZZZI_[HS]$",
18601969
"^SQRDCMLAH_ZZZI_[HS]$")>;
18611970

18621971
// Multiply saturating rounding doubling regular/complex accumulate, D element size
1863-
def : InstRW<[N3Write_5c_2V0], (instregex "^SQRDML[AS]H_ZZZI?_D$",
1972+
def : InstRW<[N3Wr_ZMASQD, N3Rd_ZMASQ], (instregex "^SQRDML[AS]H_ZZZI?_D$",
18641973
"^SQRDCMLAH_ZZZ_D$")>;
18651974

18661975
// Multiply saturating rounding doubling regular/complex, B, H, S element size
@@ -1948,8 +2057,9 @@ def : InstRW<[N3Write_2c_1V], (instregex "^FAC(GE|GT)_PPzZZ_[HSD]$",
19482057
def : InstRW<[N3Write_3c_1V], (instregex "^FCADD_ZPmZ_[HSD]$")>;
19492058

19502059
// Floating point complex multiply add
1951-
def : InstRW<[N3Write_4c_1V], (instregex "^FCMLA_ZPmZZ_[HSD]$",
1952-
"^FCMLA_ZZZI_[HS]$")>;
2060+
def : InstRW<[N3Wr_ZFCMA, ReadDefault, N3Rd_ZFCMA],
2061+
(instregex "^FCMLA_ZPmZZ_[HSD]")>;
2062+
def : InstRW<[N3Wr_ZFCMA, N3Rd_ZFCMA], (instregex "^FCMLA_ZZZI_[HS]")>;
19532063

19542064
// Floating point convert, long or narrow (F16 to F32 or F32 to F16)
19552065
def : InstRW<[N3Write_4c_2V0], (instregex "^FCVT_ZPmZ_(HtoS|StoH)",
@@ -2014,12 +2124,15 @@ def : InstRW<[N3Write_3c_1V], (instregex "^(FSCALE|FMULX)_ZPmZ_[HSD]",
20142124
"^FMUL_ZPZ[IZ]_[HSD]")>;
20152125

20162126
// Floating point multiply accumulate
2017-
def : InstRW<[N3Write_4c_1V], (instregex "^F(N?M(AD|SB)|N?ML[AS])_ZPmZZ_[HSD]$",
2018-
"^FN?ML[AS]_ZPZZZ_[HSD]",
2019-
"^FML[AS]_ZZZI_[HSD]$")>;
2127+
def : InstRW<[N3Wr_ZFMA, ReadDefault, N3Rd_ZFMA],
2128+
(instregex "^FN?ML[AS]_ZPmZZ_[HSD]",
2129+
"^FN?(MAD|MSB)_ZPmZZ_[HSD]")>;
2130+
def : InstRW<[N3Wr_ZFMA, N3Rd_ZFMA],
2131+
(instregex "^FML[AS]_ZZZI_[HSD]",
2132+
"^FN?ML[AS]_ZPZZZ_[HSD]")>;
20202133

20212134
// Floating point multiply add/sub accumulate long
2022-
def : InstRW<[N3Write_4c_1V], (instregex "^FML[AS]L[BT]_ZZZI?_SHH$")>;
2135+
def : InstRW<[N3Wr_ZFMAL, N3Rd_ZFMAL], (instregex "^FML[AS]L[BT]_ZZZI?_SHH$")>;
20232136

20242137
// Floating point reciprocal estimate, F16
20252138
def : InstRW<[N3Write_6c_4V0], (instregex "^FR(ECP|SQRT)E_ZZ_H", "^FRECPX_ZPmZ_H")>;
@@ -2079,13 +2192,13 @@ def : InstRW<[N3Write_3c_1V], (instregex "^FTS(MUL|SEL)_ZZZ_[HSD]$")>;
20792192
def : InstRW<[N3Write_4c_2V0], (instrs BFCVT_ZPmZ, BFCVTNT_ZPmZ)>;
20802193

20812194
// Dot product
2082-
def : InstRW<[N3Write_4c_1V], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
2195+
def : InstRW<[N3Wr_ZBFDOT, N3Rd_ZBFDOT], (instrs BFDOT_ZZI, BFDOT_ZZZ)>;
20832196

20842197
// Matrix multiply accumulate
2085-
def : InstRW<[N3Write_5c_1V], (instrs BFMMLA_ZZZ_HtoS)>;
2198+
def : InstRW<[N3Wr_ZBFMMA, N3Rd_ZBFMMA], (instrs BFMMLA_ZZZ_HtoS)>;
20862199

20872200
// Multiply accumulate long
2088-
def : InstRW<[N3Write_4c_1V], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;
2201+
def : InstRW<[N3Wr_ZBFMAL, N3Rd_ZBFMAL], (instregex "^BFMLAL[BT]_ZZZ(I)?$")>;
20892202

20902203
// SVE Load instructions
20912204
// -----------------------------------------------------------------------------

0 commit comments

Comments
 (0)