@@ -1109,15 +1109,31 @@ def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
11091109}
11101110def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;
11111111
1112- def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
1113- // TODO: All align instructions are expected to be of 4 cycle latency
1114- let Latency = 4;
1112+ // 128-bit VALIGN
1113+ def Zn4WriteXMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
1114+ let Latency = 2;
1115+ let ReleaseAtCycles = [1];
1116+ let NumMicroOps = 1;
1117+ }
1118+
1119+ // 256-bit VALIGN
1120+ def Zn4WriteYMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
1121+ let Latency = 3;
11151122 let ReleaseAtCycles = [1];
11161123 let NumMicroOps = 1;
11171124}
1118- def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
1119- VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)
1120- >;
1125+
1126+ // 512-bit VALIGN
1127+ def Zn4WriteZMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
1128+ let Latency = 4;
1129+ let ReleaseAtCycles = [2];
1130+ let NumMicroOps = 1;
1131+ }
1132+
1133+ def : InstRW<[Zn4WriteXMMVecALU2Slow], (instrs VALIGNDZ128rri, VALIGNQZ128rri)>; // Z128 = 128-bit (XMM) forms
1134+ def : InstRW<[Zn4WriteYMMVecALU2Slow], (instrs VALIGNDZ256rri, VALIGNQZ256rri)>; // Z256 = 256-bit (YMM) forms
1135+ def : InstRW<[Zn4WriteZMMVecALU2Slow], (instrs VALIGNDZrri, VALIGNQZrri)>;       // bare Z = 512-bit (ZMM) forms
1136+
11211137defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).
11221138
11231139def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {
0 commit comments