@@ -1109,15 +1109,33 @@ def Zn4WriteVecOpMaskKRMov : SchedWriteRes<[Zn4FPOpMask4]> {
}
def : InstRW<[Zn4WriteVecOpMaskKRMov], (instrs KMOVBkr, KMOVDkr, KMOVQkr, KMOVWkr)>;

- def Zn4WriteVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
-   // TODO: All align instructions are expected to be of 4 cycle latency
-   let Latency = 4;
+ // 128-bit VALIGN
+ def Zn4WriteXMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+   let Latency = 2;
+   let ReleaseAtCycles = [1];
+   let NumMicroOps = 1;
+ }
+
+ // 256-bit VALIGN
+ def Zn4WriteYMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+   let Latency = 3;
  let ReleaseAtCycles = [1];
  let NumMicroOps = 1;
}
- def : InstRW<[Zn4WriteVecALU2Slow], (instrs VALIGNDZrri, VALIGNDZ128rri, VALIGNDZ256rri,
-                                             VALIGNQZrri, VALIGNQZ128rri, VALIGNQZ256rri)
- >;
+
+ // 512-bit VALIGN
+ def Zn4WriteZMMVecALU2Slow : SchedWriteRes<[Zn4FPVAdd12]> {
+   let Latency = 4;
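+   // ReleaseAtCycles of 2 below assumes Zen 4 double-pumps 512-bit ops through its 256-bit FP pipes.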
+   let ReleaseAtCycles = [2];
+   let NumMicroOps = 1;
+ }
+
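+ // Note: the unsuffixed Zrri opcodes are the 512-bit forms; Z128rri/Z256rri are the 128-bit/256-bit forms.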
+ def : InstRW<[Zn4WriteXMMVecALU2Slow], (instrs VALIGNDZ128rri, VALIGNQZ128rri)>;
+ def : InstRW<[Zn4WriteYMMVecALU2Slow], (instrs VALIGNDZ256rri, VALIGNQZ256rri)>;
+ def : InstRW<[Zn4WriteZMMVecALU2Slow], (instrs VALIGNDZrri, VALIGNQZrri)>;
+
defm : Zn4WriteResYMMPair<WriteVecALUY, [Zn4FPVAdd0123], 1, [1], 1>; // Vector integer ALU op, no logicals (YMM).

def Zn4WriteVecALUYSlow : SchedWriteRes<[Zn4FPVAdd01]> {