[RISCV] Update SiFive P600's scheduling model on RVV instructions #115243
Conversation
The biggest change is assigning vector crypto instructions to the correct processor resource. Most of the latency and occupancy data was collected with our RVV-capable llvm-exegesis.
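For reference, a minimal sketch of how the new LMUL-keyed helper classes plug into the scheduling model. Every identifier below appears in the patch, but this particular pairing of write, resource, and latency helper is illustrative only and is not one of the actual assignments:

foreach mx = SchedMxList in {
  defvar LMulLat     = SiFiveP600GetLMulCycles<mx>.c;
  defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
  // The helper class maps the LMUL string to a latency value, while the
  // occupancy (ReleaseAtCycles) still scales linearly with LMUL.
  let Latency = SiFiveP600VCryptoLatency<mx>.c, ReleaseAtCycles = [LMulLat] in
    defm "" : LMULWriteResMX<"WriteVICmpV", [SiFiveP600VectorMask], mx, IsWorstCase>;
}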
@llvm/pr-subscribers-backend-risc-v

Author: Min-Yih Hsu (mshockwave)

Changes

The biggest change is assigning vector crypto instructions to the correct processor resource. The majority of these changes are guided by our RVV-capable llvm-exegesis.

Patch is 211.92 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115243.diff

9 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
index 0543b999fd647d..c2d93d4c0a7f0a 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
@@ -50,6 +50,240 @@ class SiFiveP600GetCyclesSegmented<string mx, int sew, int nf> {
int c = !mul(VLUpperBound, nf);
}
+class SiFiveP600VSM3CCycles<string mx> {
+ // c = ceil(LMUL / 2)
+ int c = !cond(!eq(mx, "M2") : 1,
+ !eq(mx, "M4") : 2,
+ !eq(mx, "M8") : 4,
+ true : 1);
+}
+
+class SiFiveP600RVVMultiplier<string mx> {
+ int c = !if(!eq(mx, "M8"), 2, 1);
+}
+
+// ======================================================================
+// The latency and occupancy data in this section are primarily evaluated
+// from llvm-exegesis.
+// ======================================================================
+
+class SiFiveP600VCryptoLatency<string mx> {
+ int c = !cond(
+ !eq(mx, "M4"): 4,
+ !eq(mx, "M8"): 8,
+ true: 2
+ );
+}
+
+class SiFiveP600VFMinMaxReduction<string mx, int sew> {
+ defvar E64Lat = !cond(
+ !eq(mx, "M1") : 4,
+ !eq(mx, "M2") : 6,
+ !eq(mx, "M4") : 8,
+ !eq(mx, "M8") : 10,
+ true: 2
+ );
+
+ defvar E64Cycles = !cond(
+ !eq(mx, "M1") : 3,
+ !eq(mx, "M2") : 4,
+ !eq(mx, "M4") : 5,
+ !eq(mx, "M8") : 6,
+ true: 2
+ );
+
+ int latency = !if(!eq(sew, 64), E64Lat, !add(E64Lat, 2));
+ int cycles = !if(!eq(sew, 64), E64Cycles, !add(E64Cycles, 1));
+}
+
+class SiFiveP600VFUnorderedReduction<string mx, int sew> {
+ defvar E64Lat = !cond(
+ !eq(mx, "M1") : 6,
+ !eq(mx, "M2") : 12,
+ !eq(mx, "M4") : 18,
+ !eq(mx, "M8") : 24,
+ true: 2
+ );
+
+ defvar E32Cycles = !cond(
+ !eq(mx, "M1") : 10,
+ !eq(mx, "M2") : 10,
+ !eq(mx, "M4") : 11,
+ !eq(mx, "M8") : 11,
+ true: 6
+ );
+
+ int latency = !if(!eq(sew, 64), E64Lat, !add(E64Lat, 4));
+ int cycles = !if(!eq(sew, 64), 6, E32Cycles);
+}
+
+class SiFiveP600VFWidenUnorderedReduction<string mx> {
+ int latency = !cond(
+ !eq(mx, "M1") : 10,
+ !eq(mx, "M2") : 18,
+ !eq(mx, "M4") : 24,
+ !eq(mx, "M8") : 30,
+ true: 6
+ );
+}
+
+class SiFiveP600VFOrderedReduction<string mx, int sew> {
+ defvar Base = !if(!eq(sew, 64), 6, 10);
+ int c = !cond(
+ !eq(mx, "M1") : Base,
+ !eq(mx, "M2") : !mul(Base, 2),
+ !eq(mx, "M4") : !mul(Base, 4),
+ !eq(mx, "M8") : !mul(Base, 8),
+ true: 6
+ );
+}
+
+class SiFiveP600VIReductionLatency<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 4,
+ !eq(mx, "M4") : 8,
+ !eq(mx, "M8") : 16,
+ // M1 and lower
+ true: 2
+ );
+}
+
+class SiFiveP600VIMinMaxReductionLatency<string mx, int sew> {
+ // +-----+-----+-----+-----+----+
+ // | | E64 | E32 | E16 | E8 |
+ // +-----+-----+-----+-----+----+
+ // | MF8 | X | X | X | 4 |
+ // +-----+-----+-----+-----+----+
+ // | MF4 | X | X | 4 | 6 |
+ // +-----+-----+-----+-----+----+
+ // | MF2 | X | 4 | 6 | 8 |
+ // +-----+-----+-----+-----+----+
+ // | M1 | 4 | 6 | 8 | 10 |
+ // +-----+-----+-----+-----+----+
+ // | M2 | 6 | 8 | 10 | 12 |
+ // +-----+-----+-----+-----+----+
+ // | M4 | 8 | 10 | 12 | 14 |
+ // +-----+-----+-----+-----+----+
+ // | M8 | 10 | 12 | 14 | 16 |
+ // +-----+-----+-----+-----+----+
+ defvar BaseIndex = !cond(
+ !eq(sew, 64): 0,
+ !eq(sew, 32): 1,
+ !eq(sew, 16): 2,
+ !eq(sew, 8): 3
+ );
+
+ defvar Latencies = [4, 6, 8, 10, 12, 14, 16];
+
+ int c = !cond(
+ !eq(mx, "M1") : Latencies[BaseIndex],
+ !eq(mx, "M2") : Latencies[!add(BaseIndex, 1)],
+ !eq(mx, "M4") : Latencies[!add(BaseIndex, 2)],
+ !eq(mx, "M8") : Latencies[!add(BaseIndex, 3)],
+ // Fractional
+ !eq(mx, "MF2"): Latencies[!sub(BaseIndex, 1)],
+ !eq(mx, "MF4"): Latencies[!sub(BaseIndex, 2)],
+ !eq(mx, "MF8"): Latencies[!sub(BaseIndex, 3)],
+ );
+}
+
+class SiFiveP600VIMinMaxReductionCycles<string mx, int sew> {
+ // +-----+-----+-----+-----+----+
+ // | | E64 | E32 | E16 | E8 |
+ // +-----+-----+-----+-----+----+
+ // | MF8 | X | X | X | 3 |
+ // +-----+-----+-----+-----+----+
+ // | MF4 | X | X | 3 | 5 |
+ // +-----+-----+-----+-----+----+
+ // | MF2 | X | 3 | 5 | 6 |
+ // +-----+-----+-----+-----+----+
+ // | M1 | 3 | 4 | 6 | 8 |
+ // +-----+-----+-----+-----+----+
+ // | M2 | 4 | 5 | 8 | 9 |
+ // +-----+-----+-----+-----+----+
+ // | M4 | 5 | 6 | 10 | 11 |
+ // +-----+-----+-----+-----+----+
+ // | M8 | 7 | 8 | 9 | 11 |
+ // +-----+-----+-----+-----+----+
+ defvar Index = !cond(
+ !eq(sew, 64): 0,
+ !eq(sew, 32): 1,
+ !eq(sew, 16): 2,
+ !eq(sew, 8): 3
+ );
+
+ defvar Cycles = [
+ [0, 0, 0, 3],
+ [0, 0, 3, 5],
+ [0, 3, 5, 6],
+ [3, 4, 6, 8],
+ [4, 5, 8, 9],
+ [5, 6, 10, 11],
+ [7, 8, 9, 11]
+ ];
+
+ int c = !cond(
+ !eq(mx, "MF8"): Cycles[0][Index],
+ !eq(mx, "MF4"): Cycles[1][Index],
+ !eq(mx, "MF2"): Cycles[2][Index],
+ !eq(mx, "M1"): Cycles[3][Index],
+ !eq(mx, "M2"): Cycles[4][Index],
+ !eq(mx, "M4"): Cycles[5][Index],
+ !eq(mx, "M8"): Cycles[6][Index],
+ );
+}
+
+class SiFiveP600VSlide1<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 3,
+ !eq(mx, "M4") : 4,
+ !eq(mx, "M8") : 8,
+ // M1 and lower
+ true: 2
+ );
+}
+
+class SiFiveP600VSlideI<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 4,
+ !eq(mx, "M4") : 6,
+ !eq(mx, "M8") : 8,
+ // M1 and lower
+ true: 2
+ );
+}
+
+class SiFiveP600VSlideXComplex<string mx, bit isUp = false> {
+ int latency = !cond(
+ !eq(mx, "M2") : 11,
+ !eq(mx, "M4") : 14,
+ !eq(mx, "M8") : 20
+ );
+
+ int cycles = !cond(
+ !eq(mx, "M2") : !if(isUp, 10, 11),
+ !eq(mx, "M4") : !if(isUp, 12, 14),
+ !eq(mx, "M8") : !if(isUp, 16, 20)
+ );
+}
+
+class SiFiveP600VPermutationComplex<string mx> {
+ int c = !cond(
+ !eq(mx, "M2") : 12,
+ !eq(mx, "M4") : 16,
+ !eq(mx, "M8") : 24
+ );
+}
+
+class SiFiveP600VSHA2MSCycles<string mx, int sew> {
+ int c = !cond(
+ !eq(mx, "M2") : !if(!eq(sew, 32), 2, 3),
+ !eq(mx, "M4") : !if(!eq(sew, 32), 4, 6),
+ !eq(mx, "M8") : !if(!eq(sew, 32), 8, 12),
+ true: 1
+ );
+}
+
// SiFiveP600 machine model for scheduling and other instruction cost heuristics.
def SiFiveP600Model : SchedMachineModel {
let IssueWidth = 4; // 4 micro-ops are dispatched per cycle.
@@ -95,6 +329,12 @@ def SiFiveP600FloatDiv : ProcResource<1>;
def SiFiveP600VEXQ0 : ProcResource<1>;
def SiFiveP600VEXQ1 : ProcResource<1>;
def SiFiveP600VectorArith : ProcResGroup<[SiFiveP600VEXQ0, SiFiveP600VEXQ1]>;
+
+// Only VEXQ0 has mask unit.
+defvar SiFiveP600VectorMask = SiFiveP600VEXQ0;
+// Only VEXQ0 has vector crypto.
+defvar SiFiveP600VectorCrypto = SiFiveP600VEXQ0;
+
def SiFiveP600VLD : ProcResource<1>;
def SiFiveP600VST : ProcResource<1>;
def SiFiveP600VDiv : ProcResource<1>;
@@ -386,7 +626,7 @@ foreach LMul = [1, 2, 4, 8] in {
def : WriteRes<!cast<SchedWrite>("WriteVLD" # LMul # "R"), [SiFiveP600VLD]>;
def : WriteRes<!cast<SchedWrite>("WriteVST" # LMul # "R"), [SiFiveP600VST]>;
}
- let Latency = LMul, ReleaseAtCycles = [LMul] in {
+ let Latency = 2, ReleaseAtCycles = [LMul] in {
def : WriteRes<!cast<SchedWrite>("WriteVMov" # LMul # "V"), [SiFiveP600VectorArith]>;
}
}
@@ -395,37 +635,42 @@ foreach LMul = [1, 2, 4, 8] in {
foreach mx = SchedMxList in {
defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
- let Latency = 1, ReleaseAtCycles = [LMulLat] in {
- defm "" : LMULWriteResMX<"WriteVIALUV", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIALUX", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIALUI", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ let Latency = 2, ReleaseAtCycles = [LMulLat] in {
defm "" : LMULWriteResMX<"WriteVExtV", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUV", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUX", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUI", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUMV", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUMX", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICALUMI", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICmpV", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICmpX", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVICmpI", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMergeV", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMergeX", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMergeI", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMovV", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMovX", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMovI", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpV", [SiFiveP600VectorMask], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpX", [SiFiveP600VectorMask], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpI", [SiFiveP600VectorMask], mx, IsWorstCase>;
}
- let Latency = 6, ReleaseAtCycles = [LMulLat] in {
- defm "" : LMULWriteResMX<"WriteVShiftV", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVShiftX", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVShiftI", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMulV", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMulX", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMulAddV", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVIMulAddX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ let ReleaseAtCycles = [LMulLat] in {
+ let Latency = 6 in {
+ defm "" : LMULWriteResMX<"WriteVIMulV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulAddV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulAddX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ }
+
+ let Latency = !mul(2, SiFiveP600RVVMultiplier<mx>.c) in {
+ defm "" : LMULWriteResMX<"WriteVIALUV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIALUX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIALUI", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUI", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMV", [SiFiveP600VectorMask], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMX", [SiFiveP600VectorMask], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMI", [SiFiveP600VectorMask], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeI", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovI", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftI", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ }
}
}
// Widening
@@ -440,7 +685,26 @@ foreach mx = SchedMxListW in {
defm "" : LMULWriteResMX<"WriteVIWMulX", [SiFiveP600VectorArith], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SiFiveP600VectorArith], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+
+ // Special case for variants with widen operands.
+ let ReleaseAtCycles = [!mul(LMulLat, 2)] in
+ def P600WriteVIWALUWidenOp_ # mx : SchedWriteRes<[SiFiveP600VectorArith]>;
}
+
+ defvar P600VIWALUBaseSchedRW = [!cast<SchedWrite>("P600WriteVIWALUWidenOp_" # mx),
+ !cast<SchedRead>("ReadVPassthru_" # mx),
+ !cast<SchedRead>("ReadVIALUV_" # mx),
+ !cast<SchedRead>("ReadVIALUV_" # mx)];
+
+ def : InstRW<P600VIWALUBaseSchedRW,
+ (instregex "^PseudoVW(ADD|SUB)[U]?_W(V|X)_" # mx # "$")>;
+ def : InstRW<P600VIWALUBaseSchedRW[0,2,3],
+ (instregex "^PseudoVW(ADD|SUB)[U]?_WV_" # mx # "_TIED$")>;
+
+ def : InstRW<!listconcat(P600VIWALUBaseSchedRW, [!cast<SchedRead>("ReadVMask")]),
+ (instregex "^PseudoVW(ADD|SUB)[U]?_W(V|X)_" # mx # "_MASK$")>;
+ def : InstRW<!listconcat(P600VIWALUBaseSchedRW[0,1,3], [!cast<SchedRead>("ReadVMask")]),
+ (instregex "^PseudoVW(ADD|SUB)[U]?_WV_" # mx # "_MASK_TIED$")>;
}
// Worst case needs 51/45/42/72 * lmul cycles for i8/16/32/64.
@@ -504,34 +768,34 @@ foreach mx = SchedMxListF in {
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
}
- let Latency = 2, ReleaseAtCycles = [LMulLat] in
- defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
- let Latency = 3, ReleaseAtCycles = [LMulLat] in
- defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
}
}
foreach mx = SchedMxListF in {
foreach sew = SchedSEWSet<mx, isF=1>.val in {
defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
- defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxList, isF=1>.c;
- let Latency = 1, ReleaseAtCycles = [LMulLat] in {
+ defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+ let Latency = !mul(2, SiFiveP600RVVMultiplier<mx>.c), ReleaseAtCycles = [LMulLat] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
- defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
}
+ let Latency = !if(!eq(mx, "M8"), 4, 3), ReleaseAtCycles = [!if(!eq(LMulLat, 1), 2, LMulLat)] in
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
}
}
foreach mx = SchedMxList in {
defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
- let Latency = 3, ReleaseAtCycles = [LMulLat] in
+ let Latency = !if(!eq(mx, "M8"), 4, 3), ReleaseAtCycles = [!if(!eq(LMulLat, 1), 2, LMulLat)] in
defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SiFiveP600VectorArith], mx, IsWorstCase>;
let Latency = 2, ReleaseAtCycles = [LMulLat] in {
- defm "" : LMULWriteResMX<"WriteVFCmpV", [SiFiveP600VectorArith], mx, IsWorstCase>;
- defm "" : LMULWriteResMX<"WriteVFCmpF", [SiFiveP600VectorArith], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFCmpV", [SiFiveP600VectorMask], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVFCmpF", [SiFiveP600VectorMask], mx, IsWorstCase>;
}
- let Latency = 1, ReleaseAtCycles = [LMulLat] in {
+ let Latency = !mul(2, SiFiveP600RVVMultiplier<mx>.c),
+ ReleaseAtCycles = [!if(!eq(LMulLat, 1), 2, LMulLat)] in {
defm "" : LMULWriteResMX<"WriteVFClassV", [SiFiveP600VectorArith], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVFMergeV", [SiFiveP600VectorArith], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVFMovV", [SiFiveP600VectorArith], mx, IsWorstCase>;
@@ -565,7 +829,31 @@ foreach mx = SchedMxListFW in {
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
+
+ // Special case for variants with widen operands.
+ let ReleaseAtCycles = [!mul(LMulLat, 2)] in
+ def P600WriteVFWALUWidenOp_ # mx # _E # sew : SchedWriteRes<[SiFiveP600VectorArith]>;
}
+
+ defvar P600VFWALUBaseSchedRW = [!cast<SchedWrite>("P600WriteVFWALUWidenOp_" # mx # "_E" # sew),
+ !cast<SchedRead>("ReadVPassthru_" # mx # "_E" # sew),
+ !cast<SchedRead>("ReadVFWALUV_" # mx # "_E" # sew)];
+
+ def : InstRW<!listconcat(P600VFWALUBaseSchedRW, [!cast<SchedRead>("ReadVFWALUV_" # mx # "_E" # sew)]),
+ (instregex "^PseudoVFW(ADD|SUB)_WV_" # mx # "_E" # sew # "$")>;
+ def : InstRW<[P600VFWALUBaseSchedRW[0], P600VFWALUBaseSchedRW[2], !cast<SchedRead>("ReadVFWALUV_" # mx # "_E" # sew)],
+ (instregex "^PseudoVFW(ADD|SUB)_WV_" # mx # "_E" # sew # "_TIED$")>;
+
+ def : InstRW<!listconcat(P600VFWALUBaseSchedRW, [!cast<SchedRead>("ReadVFWALUF_" # mx # "_E" # sew)]),
+ (instregex "^PseudoVFW(ADD|SUB)_WFPR" # sew # "_" # mx # "_E" # sew # "$")>;
+
+ def : InstRW<!listconcat(P600VFWALUBaseSchedRW, [!cast<SchedRead>("ReadVFWALUV_" # mx # "_E" # sew), !cast<SchedRead>("ReadVMask")]),
+ (instregex "^PseudoVFW(ADD|SUB)_WV_" # mx # "_E" # sew # "_MASK$")>;
+ def : InstRW<[P600VFWALUBaseSchedRW[0], P600VFWALUBaseSchedRW[1], !cast<SchedRead>("ReadVFWALUV_" # mx # "_E" # sew), !cast<SchedRead>("ReadVMask")],
+ (instregex "^PseudoVFW(ADD|SUB)_WV_" # mx # "_E" # sew # "_MASK_TIED$")>;
+
+ def : InstRW<!listconcat(P600VFWALUBaseSchedRW, [!cast<SchedRead>("ReadVFWALUF_" # mx # "_E" # sew), !cast<SchedRead>("ReadVMask")]),
+ (instregex "^PseudoVFW(ADD|SUB)_WFPR" # sew # "_" # mx # "_E" # sew # "_MASK$")>;
}
}
// Narrowing
@@ -580,7 +868,7 @@ foreach mx = SchedMxListFW in {
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
- let Latency = 3, ReleaseAtCycles = [LMulLat] in {
+ let Latency = 3, ReleaseAtCycles = [!if(!eq(LMulLat, 1), 2, LMulLat)] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SiF...
[truncated]
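As a small worked check of the diagonal lookup in SiFiveP600VIMinMaxReductionLatency above (illustrative only; the record name below is hypothetical and not part of the patch):

// For mx = "M2" and sew = 32, BaseIndex is 1 and the M2 arm reads
// Latencies[!add(1, 1)] = Latencies[2] = 8, which matches the (M2, E32)
// cell of the latency table in the class comment.
def P600ExampleMinMaxLatency_M2_E32 {
  int c = SiFiveP600VIMinMaxReductionLatency<"M2", 32>.c; // evaluates to 8
}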
ping
wangpc-pp left a comment
LGTM.