[RISCV] Update SiFive P600's scheduling model on RVV instructions #115243

mshockwave · 2024-11-07T00:32:36Z

The biggest change is assigning vector crypto instructions to the correct processor resource.

The majority of these changes are guided by our RVV-capable llvm-exegesis.

The biggest change is assigning vector crypto instructions to the correct processor resource. The majority of these data are collected using our RVV-capable llvm-exegesis.

llvmbot · 2024-11-07T00:33:11Z

@llvm/pr-subscribers-backend-risc-v

Author: Min-Yih Hsu (mshockwave)

Changes

The biggest change is assigning vector crypto instructions to the correct processor resource.

The majority of these changes are guided by our RVV-capable llvm-exegesis.

Patch is 211.92 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115243.diff

9 Files Affected:

(modified) llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td (+461-117)
(added) llvm/test/tools/llvm-mca/RISCV/SiFiveP600/mask.s (+129)
(added) llvm/test/tools/llvm-mca/RISCV/SiFiveP600/vmv.s (+816)
(modified) llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvbb.s (+169-169)
(modified) llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvbc.s (+23-23)
(modified) llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvkg.s (+27-27)
(modified) llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvkned.s (+59-59)
(modified) llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvknhb.s (+35-35)
(modified) llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvksh.s (+16-16)

diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
index 0543b999fd647d..c2d93d4c0a7f0a 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
@@ -50,6 +50,240 @@ class SiFiveP600GetCyclesSegmented<string mx, int sew, int nf> {
   int c = !mul(VLUpperBound, nf);
 }
 
+class SiFiveP600VSM3CCycles<string mx> {
+  // c = ceil(LMUL / 2)
+  int c = !cond(!eq(mx, "M2") : 1,
+                !eq(mx, "M4") : 2,
+                !eq(mx, "M8") : 4,
+                true : 1);
+}
+
+class SiFiveP600RVVMultiplier<string mx> {
+  int c = !if(!eq(mx, "M8"), 2, 1);
+}
+
+// ======================================================================
+// The latency and occupancy data in this section are primarily evaluated
+// from llvm-exegesis.
+// ======================================================================
+
+class SiFiveP600VCryptoLatency<string mx> {
+  int c = !cond(
+    !eq(mx, "M4"): 4,
+    !eq(mx, "M8"): 8,
+    true:          2
+  );
+}
+
+class SiFiveP600VFMinMaxReduction<string mx, int sew> {
+  defvar E64Lat = !cond(
+    !eq(mx, "M1") : 4,
+    !eq(mx, "M2") : 6,
+    !eq(mx, "M4") : 8,
+    !eq(mx, "M8") : 10,
+    true:           2
+  );
+
+  defvar E64Cycles = !cond(
+    !eq(mx, "M1") : 3,
+    !eq(mx, "M2") : 4,
+    !eq(mx, "M4") : 5,
+    !eq(mx, "M8") : 6,
+    true:           2
+  );
+
+  int latency = !if(!eq(sew, 64), E64Lat, !add(E64Lat, 2));
+  int cycles = !if(!eq(sew, 64), E64Cycles, !add(E64Cycles, 1));
+}
+
+class SiFiveP600VFUnorderedReduction<string mx, int sew> {
+  defvar E64Lat = !cond(
+    !eq(mx, "M1") : 6,
+    !eq(mx, "M2") : 12,
+    !eq(mx, "M4") : 18,
+    !eq(mx, "M8") : 24,
+    true:           2
+  );
+
+  defvar E32Cycles = !cond(
+    !eq(mx, "M1") : 10,
+    !eq(mx, "M2") : 10,
+    !eq(mx, "M4") : 11,
+    !eq(mx, "M8") : 11,
+    true:           6
+  );
+
+  int latency = !if(!eq(sew, 64), E64Lat, !add(E64Lat, 4));
+  int cycles = !if(!eq(sew, 64), 6, E32Cycles);
+}
+
+class SiFiveP600VFWidenUnorderedReduction<string mx> {
+  int latency = !cond(
+    !eq(mx, "M1") : 10,
+    !eq(mx, "M2") : 18,
+    !eq(mx, "M4") : 24,
+    !eq(mx, "M8") : 30,
+    true:           6
+  );
+}
+
+class SiFiveP600VFOrderedReduction<string mx, int sew> {
+  defvar Base = !if(!eq(sew, 64), 6, 10);
+  int c = !cond(
+    !eq(mx, "M1") : Base,
+    !eq(mx, "M2") : !mul(Base, 2),
+    !eq(mx, "M4") : !mul(Base, 4),
+    !eq(mx, "M8") : !mul(Base, 8),
+    true:           6
+  );
+}
+
+class SiFiveP600VIReductionLatency<string mx> {
+  int c = !cond(
+    !eq(mx, "M2") : 4,
+    !eq(mx, "M4") : 8,
+    !eq(mx, "M8") : 16,
+    // M1 and lower
+    true:           2
+  );
+}
+
+class SiFiveP600VIMinMaxReductionLatency<string mx, int sew> {
+  // +-----+-----+-----+-----+----+
+  // |     | E64 | E32 | E16 | E8 |
+  // +-----+-----+-----+-----+----+
+  // | MF8 |  X  |  X  |  X  |  4 |
+  // +-----+-----+-----+-----+----+
+  // | MF4 |  X  |  X  |  4  |  6 |
+  // +-----+-----+-----+-----+----+
+  // | MF2 |  X  |  4  |  6  |  8 |
+  // +-----+-----+-----+-----+----+
+  // | M1  |  4  |  6  |  8  | 10 |
+  // +-----+-----+-----+-----+----+
+  // | M2  |  6  |  8  |  10 | 12 |
+  // +-----+-----+-----+-----+----+
+  // | M4  |  8  |  10 |  12 | 14 |
+  // +-----+-----+-----+-----+----+
+  // | M8  |  10 |  12 |  14 | 16 |
+  // +-----+-----+-----+-----+----+
+  defvar BaseIndex = !cond(
+    !eq(sew, 64): 0,
+    !eq(sew, 32): 1,
+    !eq(sew, 16): 2,
+    !eq(sew, 8):  3
+  );
+
+  defvar Latencies = [4, 6, 8, 10, 12, 14, 16];
+
+  int c = !cond(
+    !eq(mx, "M1") : Latencies[BaseIndex],
+    !eq(mx, "M2") : Latencies[!add(BaseIndex, 1)],
+    !eq(mx, "M4") : Latencies[!add(BaseIndex, 2)],
+    !eq(mx, "M8") : Latencies[!add(BaseIndex, 3)],
+    // Fractional
+    !eq(mx, "MF2"): Latencies[!sub(BaseIndex, 1)],
+    !eq(mx, "MF4"): Latencies[!sub(BaseIndex, 2)],
+    !eq(mx, "MF8"): Latencies[!sub(BaseIndex, 3)],
+  );
+}
+
+class SiFiveP600VIMinMaxReductionCycles<string mx, int sew> {
+  // +-----+-----+-----+-----+----+
+  // |     | E64 | E32 | E16 | E8 |
+  // +-----+-----+-----+-----+----+
+  // | MF8 |  X  |  X  |  X  |  3 |
+  // +-----+-----+-----+-----+----+
+  // | MF4 |  X  |  X  |  3  |  5 |
+  // +-----+-----+-----+-----+----+
+  // | MF2 |  X  |  3  |  5  |  6 |
+  // +-----+-----+-----+-----+----+
+  // | M1  |  3  |  4  |  6  |  8 |
+  // +-----+-----+-----+-----+----+
+  // | M2  |  4  |  5  |  8  |  9 |
+  // +-----+-----+-----+-----+----+
+  // | M4  |  5  |  6  |  10 | 11 |
+  // +-----+-----+-----+-----+----+
+  // | M8  |  7  |  8  |  9  | 11 |
+  // +-----+-----+-----+-----+----+
+  defvar Index = !cond(
+    !eq(sew, 64): 0,
+    !eq(sew, 32): 1,
+    !eq(sew, 16): 2,
+    !eq(sew, 8):  3
+  );
+
+  defvar Cycles = [
+    [0, 0, 0,  3],
+    [0, 0, 3,  5],
+    [0, 3, 5,  6],
+    [3, 4, 6,  8],
+    [4, 5, 8,  9],
+    [5, 6, 10, 11],
+    [7, 8, 9,  11]
+  ];
+
+  int c = !cond(
+    !eq(mx, "MF8"): Cycles[0][Index],
+    !eq(mx, "MF4"): Cycles[1][Index],
+    !eq(mx, "MF2"): Cycles[2][Index],
+    !eq(mx, "M1"):  Cycles[3][Index],
+    !eq(mx, "M2"):  Cycles[4][Index],
+    !eq(mx, "M4"):  Cycles[5][Index],
+    !eq(mx, "M8"):  Cycles[6][Index],
+  );
+}
+
+class SiFiveP600VSlide1<string mx> {
+  int c = !cond(
+    !eq(mx, "M2") : 3,
+    !eq(mx, "M4") : 4,
+    !eq(mx, "M8") : 8,
+    // M1 and lower
+    true:           2
+  );
+}
+
+class SiFiveP600VSlideI<string mx> {
+  int c = !cond(
+    !eq(mx, "M2") : 4,
+    !eq(mx, "M4") : 6,
+    !eq(mx, "M8") : 8,
+    // M1 and lower
+    true:           2
+  );
+}
+
+class SiFiveP600VSlideXComplex<string mx, bit isUp = false> {
+  int latency = !cond(
+    !eq(mx, "M2") : 11,
+    !eq(mx, "M4") : 14,
+    !eq(mx, "M8") : 20
+  );
+
+  int cycles = !cond(
+    !eq(mx, "M2") : !if(isUp, 10, 11),
+    !eq(mx, "M4") : !if(isUp, 12, 14),
+    !eq(mx, "M8") : !if(isUp, 16, 20)
+  );
+}
+
+class SiFiveP600VPermutationComplex<string mx> {
+  int c = !cond(
+    !eq(mx, "M2") : 12,
+    !eq(mx, "M4") : 16,
+    !eq(mx, "M8") : 24
+  );
+}
+
+class SiFiveP600VSHA2MSCycles<string mx, int sew> {
+  int c = !cond(
+    !eq(mx, "M2") : !if(!eq(sew, 32), 2, 3),
+    !eq(mx, "M4") : !if(!eq(sew, 32), 4, 6),
+    !eq(mx, "M8") : !if(!eq(sew, 32), 8, 12),
+    true: 1
+  );
+}
+
 // SiFiveP600 machine model for scheduling and other instruction cost heuristics.
 def SiFiveP600Model : SchedMachineModel {
   let IssueWidth = 4;         // 4 micro-ops are dispatched per cycle.
@@ -95,6 +329,12 @@ def SiFiveP600FloatDiv    : ProcResource<1>;
 def SiFiveP600VEXQ0        : ProcResource<1>;
 def SiFiveP600VEXQ1        : ProcResource<1>;
 def SiFiveP600VectorArith  : ProcResGroup<[SiFiveP600VEXQ0, SiFiveP600VEXQ1]>;
+
+// Only VEXQ0 has mask unit.
+defvar SiFiveP600VectorMask = SiFiveP600VEXQ0;
+// Only VEXQ0 has vector crypto.
+defvar SiFiveP600VectorCrypto = SiFiveP600VEXQ0;
+
 def SiFiveP600VLD          : ProcResource<1>;
 def SiFiveP600VST          : ProcResource<1>;
 def SiFiveP600VDiv         : ProcResource<1>;
@@ -386,7 +626,7 @@ foreach LMul = [1, 2, 4, 8] in {
     def : WriteRes<!cast<SchedWrite>("WriteVLD" # LMul # "R"), [SiFiveP600VLD]>;
     def : WriteRes<!cast<SchedWrite>("WriteVST" # LMul # "R"), [SiFiveP600VST]>;
   }
-  let Latency = LMul, ReleaseAtCycles = [LMul] in {
+  let Latency = 2, ReleaseAtCycles = [LMul] in {
     def : WriteRes<!cast<SchedWrite>("WriteVMov" # LMul # "V"), [SiFiveP600VectorArith]>;
   }
 }
@@ -395,37 +635,42 @@ foreach LMul = [1, 2, 4, 8] in {
 foreach mx = SchedMxList in {
   defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
   defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
-  let Latency = 1, ReleaseAtCycles = [LMulLat] in {
-    defm "" : LMULWriteResMX<"WriteVIALUV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIALUX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIALUI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+  let Latency = 2, ReleaseAtCycles = [LMulLat] in {
     defm "" : LMULWriteResMX<"WriteVExtV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICALUV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICALUX",  [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICALUI",  [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICALUMV", [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICALUMX", [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICALUMI", [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICmpV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICmpX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICmpI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMergeV", [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMergeX", [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMergeI", [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMovV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMovX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMovI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICmpV",   [SiFiveP600VectorMask],  mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICmpX",   [SiFiveP600VectorMask],  mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICmpI",   [SiFiveP600VectorMask],  mx, IsWorstCase>;
   }
-  let Latency = 6, ReleaseAtCycles = [LMulLat] in {
-    defm "" : LMULWriteResMX<"WriteVShiftV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVShiftX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVShiftI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMulV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMulX",    [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMulAddV", [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMulAddX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  let ReleaseAtCycles = [LMulLat] in {
+    let Latency = 6 in {
+      defm "" : LMULWriteResMX<"WriteVIMulV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMulX",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMulAddV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMulAddX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    }
+
+    let Latency = !mul(2, SiFiveP600RVVMultiplier<mx>.c) in {
+      defm "" : LMULWriteResMX<"WriteVIALUV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIALUX",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIALUI",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVICALUV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVICALUX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVICALUI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVICALUMV",  [SiFiveP600VectorMask],  mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVICALUMX",  [SiFiveP600VectorMask],  mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVICALUMI",  [SiFiveP600VectorMask],  mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMergeV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMergeX",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMergeI",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMovX",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMovI",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVShiftI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVShiftV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVShiftX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMovV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    }
   }
 }
 // Widening
@@ -440,7 +685,26 @@ foreach mx = SchedMxListW in {
     defm "" : LMULWriteResMX<"WriteVIWMulX",    [SiFiveP600VectorArith], mx, IsWorstCase>;
     defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SiFiveP600VectorArith], mx, IsWorstCase>;
     defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+
+    // Special case for variants with widen operands.
+    let ReleaseAtCycles = [!mul(LMulLat, 2)] in
+    def P600WriteVIWALUWidenOp_ # mx : SchedWriteRes<[SiFiveP600VectorArith]>;
   }
+
+  defvar P600VIWALUBaseSchedRW = [!cast<SchedWrite>("P600WriteVIWALUWidenOp_" # mx),
+                                  !cast<SchedRead>("ReadVPassthru_" # mx),
+                                  !cast<SchedRead>("ReadVIALUV_" # mx),
+                                  !cast<SchedRead>("ReadVIALUV_" # mx)];
+
+  def : InstRW<P600VIWALUBaseSchedRW,
+               (instregex "^PseudoVW(ADD|SUB)[U]?_W(V|X)_" # mx # "$")>;
+  def : InstRW<P600VIWALUBaseSchedRW[0,2,3],
+               (instregex "^PseudoVW(ADD|SUB)[U]?_WV_" # mx # "_TIED$")>;
+
+  def : InstRW<!listconcat(P600VIWALUBaseSchedRW, [!cast<SchedRead>("ReadVMask")]),
+               (instregex "^PseudoVW(ADD|SUB)[U]?_W(V|X)_" # mx # "_MASK$")>;
+  def : InstRW<!listconcat(P600VIWALUBaseSchedRW[0,1,3], [!cast<SchedRead>("ReadVMask")]),
+               (instregex "^PseudoVW(ADD|SUB)[U]?_WV_" # mx # "_MASK_TIED$")>;
 }
 
 // Worst case needs 51/45/42/72 * lmul cycles for i8/16/32/64.
@@ -504,34 +768,34 @@ foreach mx = SchedMxListF in {
       defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
       defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
     }
-    let Latency = 2, ReleaseAtCycles = [LMulLat] in
-    defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
-    let Latency = 3, ReleaseAtCycles = [LMulLat] in
-    defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
   }
 }
 foreach mx = SchedMxListF in {
   foreach sew = SchedSEWSet<mx, isF=1>.val in {
     defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
-    defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxList, isF=1>.c;
-    let Latency = 1, ReleaseAtCycles = [LMulLat] in {
+    defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+    let Latency = !mul(2, SiFiveP600RVVMultiplier<mx>.c), ReleaseAtCycles = [LMulLat] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
       defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
       defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV",   [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF",   [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
     }
+    let Latency = !if(!eq(mx, "M8"), 4, 3), ReleaseAtCycles = [!if(!eq(LMulLat, 1), 2, LMulLat)] in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
   }
 }
 foreach mx = SchedMxList in {
   defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
   defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
-  let Latency = 3, ReleaseAtCycles = [LMulLat] in
+  let Latency = !if(!eq(mx, "M8"), 4, 3), ReleaseAtCycles = [!if(!eq(LMulLat, 1), 2, LMulLat)] in
   defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SiFiveP600VectorArith], mx, IsWorstCase>;
   let Latency = 2, ReleaseAtCycles = [LMulLat] in {
-    defm "" : LMULWriteResMX<"WriteVFCmpV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVFCmpF",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFCmpV",  [SiFiveP600VectorMask], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFCmpF",  [SiFiveP600VectorMask], mx, IsWorstCase>;
   }
-  let Latency = 1, ReleaseAtCycles = [LMulLat] in {
+  let Latency = !mul(2, SiFiveP600RVVMultiplier<mx>.c),
+      ReleaseAtCycles = [!if(!eq(LMulLat, 1), 2, LMulLat)] in {
     defm "" : LMULWriteResMX<"WriteVFClassV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
     defm "" : LMULWriteResMX<"WriteVFMergeV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
     defm "" : LMULWriteResMX<"WriteVFMovV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
@@ -565,7 +829,31 @@ foreach mx = SchedMxListFW in {
       defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
       defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
       defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
+
+      // Special case for variants with widen operands.
+      let ReleaseAtCycles = [!mul(LMulLat, 2)] in
+      def P600WriteVFWALUWidenOp_ # mx # _E # sew : SchedWriteRes<[SiFiveP600VectorArith]>;
     }
+
+    defvar P600VFWALUBaseSchedRW = [!cast<SchedWrite>("P600WriteVFWALUWidenOp_" # mx # "_E" # sew),
+                                    !cast<SchedRead>("ReadVPassthru_" # mx # "_E" # sew),
+                                    !cast<SchedRead>("ReadVFWALUV_" # mx # "_E" # sew)];
+
+    def : InstRW<!listconcat(P600VFWALUBaseSchedRW, [!cast<SchedRead>("ReadVFWALUV_" # mx # "_E" # sew)]),
+                 (instregex "^PseudoVFW(ADD|SUB)_WV_" # mx # "_E" # sew # "$")>;
+    def : InstRW<[P600VFWALUBaseSchedRW[0], P600VFWALUBaseSchedRW[2], !cast<SchedRead>("ReadVFWALUV_" # mx # "_E" # sew)],
+                 (instregex "^PseudoVFW(ADD|SUB)_WV_" # mx # "_E" # sew # "_TIED$")>;
+
+    def : InstRW<!listconcat(P600VFWALUBaseSchedRW, [!cast<SchedRead>("ReadVFWALUF_" # mx # "_E" # sew)]),
+                 (instregex "^PseudoVFW(ADD|SUB)_WFPR" # sew # "_" # mx # "_E" # sew # "$")>;
+
+    def : InstRW<!listconcat(P600VFWALUBaseSchedRW, [!cast<SchedRead>("ReadVFWALUV_" # mx # "_E" # sew), !cast<SchedRead>("ReadVMask")]),
+                 (instregex "^PseudoVFW(ADD|SUB)_WV_" # mx # "_E" # sew # "_MASK$")>;
+    def : InstRW<[P600VFWALUBaseSchedRW[0], P600VFWALUBaseSchedRW[1], !cast<SchedRead>("ReadVFWALUV_" # mx # "_E" # sew), !cast<SchedRead>("ReadVMask")],
+                 (instregex "^PseudoVFW(ADD|SUB)_WV_" # mx # "_E" # sew # "_MASK_TIED$")>;
+
+    def : InstRW<!listconcat(P600VFWALUBaseSchedRW, [!cast<SchedRead>("ReadVFWALUF_" # mx # "_E" # sew), !cast<SchedRead>("ReadVMask")]),
+                 (instregex "^PseudoVFW(ADD|SUB)_WFPR" # sew # "_" # mx # "_E" # sew # "_MASK$")>;
   }
 }
 // Narrowing
@@ -580,7 +868,7 @@ foreach mx = SchedMxListFW in {
   foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
     defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
     defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
-    let Latency = 3, ReleaseAtCycles = [LMulLat] in {
+    let Latency = 3, ReleaseAtCycles = [!if(!eq(LMulLat, 1), 2, LMulLat)] in {
       defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SiF...
[truncated]

mshockwave · 2024-11-10T23:47:14Z

ping

wangpc-pp

LGTM.

[RISCV] Update SiFive P600's scheduling model on RVV instructions

cb21cfd

The biggest change is assigning vector crypto instructions to the correct processor resource. The majority of these data are collected using our RVV-capable llvm-exegesis.

mshockwave requested review from dtcxzyw, michaelmaitland, topperc and wangpc-pp November 7, 2024 00:32

llvmbot added the backend:RISC-V label Nov 7, 2024

wangpc-pp approved these changes Nov 11, 2024

View reviewed changes

mshockwave merged commit 84e95be into llvm:main Nov 12, 2024
8 of 10 checks passed

mshockwave deleted the patch/rvv/sifive-p600-sched-update branch November 12, 2024 23:29

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[RISCV] Update SiFive P600's scheduling model on RVV instructions #115243

[RISCV] Update SiFive P600's scheduling model on RVV instructions #115243

Uh oh!

mshockwave commented Nov 7, 2024

Uh oh!

llvmbot commented Nov 7, 2024

Uh oh!

mshockwave commented Nov 10, 2024

Uh oh!

wangpc-pp left a comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

[RISCV] Update SiFive P600's scheduling model on RVV instructions #115243

[RISCV] Update SiFive P600's scheduling model on RVV instructions #115243

Uh oh!

Conversation

mshockwave commented Nov 7, 2024

Uh oh!

llvmbot commented Nov 7, 2024

Uh oh!

mshockwave commented Nov 10, 2024

Uh oh!

wangpc-pp left a comment

Choose a reason for hiding this comment

Uh oh!

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants