Skip to content

Conversation

@mshockwave
Copy link
Member

The biggest change is assigning vector crypto instructions to the correct processor resource.

The majority of these changes are guided by our RVV-capable llvm-exegesis.

The biggest change is assigning vector crypto instructions to the
correct processor resource.

The majority of these data are collected using our RVV-capable
llvm-exegesis.
@llvmbot
Copy link
Member

llvmbot commented Nov 7, 2024

@llvm/pr-subscribers-backend-risc-v

Author: Min-Yih Hsu (mshockwave)

Changes

The biggest change is assigning vector crypto instructions to the correct processor resource.

The majority of these changes are guided by our RVV-capable llvm-exegesis.


Patch is 211.92 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/115243.diff

9 Files Affected:

  • (modified) llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td (+461-117)
  • (added) llvm/test/tools/llvm-mca/RISCV/SiFiveP600/mask.s (+129)
  • (added) llvm/test/tools/llvm-mca/RISCV/SiFiveP600/vmv.s (+816)
  • (modified) llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvbb.s (+169-169)
  • (modified) llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvbc.s (+23-23)
  • (modified) llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvkg.s (+27-27)
  • (modified) llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvkned.s (+59-59)
  • (modified) llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvknhb.s (+35-35)
  • (modified) llvm/test/tools/llvm-mca/RISCV/SiFiveP600/zvksh.s (+16-16)
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
index 0543b999fd647d..c2d93d4c0a7f0a 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td
@@ -50,6 +50,240 @@ class SiFiveP600GetCyclesSegmented<string mx, int sew, int nf> {
   int c = !mul(VLUpperBound, nf);
 }
 
+class SiFiveP600VSM3CCycles<string mx> {
+  // c = ceil(LMUL / 2)
+  int c = !cond(!eq(mx, "M2") : 1,
+                !eq(mx, "M4") : 2,
+                !eq(mx, "M8") : 4,
+                true : 1);
+}
+
+class SiFiveP600RVVMultiplier<string mx> {
+  int c = !if(!eq(mx, "M8"), 2, 1);
+}
+
+// ======================================================================
+// The latency and occupancy data in this section are primarily evaluated
+// from llvm-exegesis.
+// ======================================================================
+
+class SiFiveP600VCryptoLatency<string mx> {
+  int c = !cond(
+    !eq(mx, "M4"): 4,
+    !eq(mx, "M8"): 8,
+    true:          2
+  );
+}
+
+class SiFiveP600VFMinMaxReduction<string mx, int sew> {
+  defvar E64Lat = !cond(
+    !eq(mx, "M1") : 4,
+    !eq(mx, "M2") : 6,
+    !eq(mx, "M4") : 8,
+    !eq(mx, "M8") : 10,
+    true:           2
+  );
+
+  defvar E64Cycles = !cond(
+    !eq(mx, "M1") : 3,
+    !eq(mx, "M2") : 4,
+    !eq(mx, "M4") : 5,
+    !eq(mx, "M8") : 6,
+    true:           2
+  );
+
+  int latency = !if(!eq(sew, 64), E64Lat, !add(E64Lat, 2));
+  int cycles = !if(!eq(sew, 64), E64Cycles, !add(E64Cycles, 1));
+}
+
+class SiFiveP600VFUnorderedReduction<string mx, int sew> {
+  defvar E64Lat = !cond(
+    !eq(mx, "M1") : 6,
+    !eq(mx, "M2") : 12,
+    !eq(mx, "M4") : 18,
+    !eq(mx, "M8") : 24,
+    true:           2
+  );
+
+  defvar E32Cycles = !cond(
+    !eq(mx, "M1") : 10,
+    !eq(mx, "M2") : 10,
+    !eq(mx, "M4") : 11,
+    !eq(mx, "M8") : 11,
+    true:           6
+  );
+
+  int latency = !if(!eq(sew, 64), E64Lat, !add(E64Lat, 4));
+  int cycles = !if(!eq(sew, 64), 6, E32Cycles);
+}
+
+class SiFiveP600VFWidenUnorderedReduction<string mx> {
+  int latency = !cond(
+    !eq(mx, "M1") : 10,
+    !eq(mx, "M2") : 18,
+    !eq(mx, "M4") : 24,
+    !eq(mx, "M8") : 30,
+    true:           6
+  );
+}
+
+class SiFiveP600VFOrderedReduction<string mx, int sew> {
+  defvar Base = !if(!eq(sew, 64), 6, 10);
+  int c = !cond(
+    !eq(mx, "M1") : Base,
+    !eq(mx, "M2") : !mul(Base, 2),
+    !eq(mx, "M4") : !mul(Base, 4),
+    !eq(mx, "M8") : !mul(Base, 8),
+    true:           6
+  );
+}
+
+class SiFiveP600VIReductionLatency<string mx> {
+  int c = !cond(
+    !eq(mx, "M2") : 4,
+    !eq(mx, "M4") : 8,
+    !eq(mx, "M8") : 16,
+    // M1 and lower
+    true:           2
+  );
+}
+
+class SiFiveP600VIMinMaxReductionLatency<string mx, int sew> {
+  // +-----+-----+-----+-----+----+
+  // |     | E64 | E32 | E16 | E8 |
+  // +-----+-----+-----+-----+----+
+  // | MF8 |  X  |  X  |  X  |  4 |
+  // +-----+-----+-----+-----+----+
+  // | MF4 |  X  |  X  |  4  |  6 |
+  // +-----+-----+-----+-----+----+
+  // | MF2 |  X  |  4  |  6  |  8 |
+  // +-----+-----+-----+-----+----+
+  // | M1  |  4  |  6  |  8  | 10 |
+  // +-----+-----+-----+-----+----+
+  // | M2  |  6  |  8  |  10 | 12 |
+  // +-----+-----+-----+-----+----+
+  // | M4  |  8  |  10 |  12 | 14 |
+  // +-----+-----+-----+-----+----+
+  // | M8  |  10 |  12 |  14 | 16 |
+  // +-----+-----+-----+-----+----+
+  defvar BaseIndex = !cond(
+    !eq(sew, 64): 0,
+    !eq(sew, 32): 1,
+    !eq(sew, 16): 2,
+    !eq(sew, 8):  3
+  );
+
+  defvar Latencies = [4, 6, 8, 10, 12, 14, 16];
+
+  int c = !cond(
+    !eq(mx, "M1") : Latencies[BaseIndex],
+    !eq(mx, "M2") : Latencies[!add(BaseIndex, 1)],
+    !eq(mx, "M4") : Latencies[!add(BaseIndex, 2)],
+    !eq(mx, "M8") : Latencies[!add(BaseIndex, 3)],
+    // Fractional
+    !eq(mx, "MF2"): Latencies[!sub(BaseIndex, 1)],
+    !eq(mx, "MF4"): Latencies[!sub(BaseIndex, 2)],
+    !eq(mx, "MF8"): Latencies[!sub(BaseIndex, 3)],
+  );
+}
+
+class SiFiveP600VIMinMaxReductionCycles<string mx, int sew> {
+  // +-----+-----+-----+-----+----+
+  // |     | E64 | E32 | E16 | E8 |
+  // +-----+-----+-----+-----+----+
+  // | MF8 |  X  |  X  |  X  |  3 |
+  // +-----+-----+-----+-----+----+
+  // | MF4 |  X  |  X  |  3  |  5 |
+  // +-----+-----+-----+-----+----+
+  // | MF2 |  X  |  3  |  5  |  6 |
+  // +-----+-----+-----+-----+----+
+  // | M1  |  3  |  4  |  6  |  8 |
+  // +-----+-----+-----+-----+----+
+  // | M2  |  4  |  5  |  8  |  9 |
+  // +-----+-----+-----+-----+----+
+  // | M4  |  5  |  6  |  10 | 11 |
+  // +-----+-----+-----+-----+----+
+  // | M8  |  7  |  8  |  9  | 11 |
+  // +-----+-----+-----+-----+----+
+  defvar Index = !cond(
+    !eq(sew, 64): 0,
+    !eq(sew, 32): 1,
+    !eq(sew, 16): 2,
+    !eq(sew, 8):  3
+  );
+
+  defvar Cycles = [
+    [0, 0, 0,  3],
+    [0, 0, 3,  5],
+    [0, 3, 5,  6],
+    [3, 4, 6,  8],
+    [4, 5, 8,  9],
+    [5, 6, 10, 11],
+    [7, 8, 9,  11]
+  ];
+
+  int c = !cond(
+    !eq(mx, "MF8"): Cycles[0][Index],
+    !eq(mx, "MF4"): Cycles[1][Index],
+    !eq(mx, "MF2"): Cycles[2][Index],
+    !eq(mx, "M1"):  Cycles[3][Index],
+    !eq(mx, "M2"):  Cycles[4][Index],
+    !eq(mx, "M4"):  Cycles[5][Index],
+    !eq(mx, "M8"):  Cycles[6][Index],
+  );
+}
+
+class SiFiveP600VSlide1<string mx> {
+  int c = !cond(
+    !eq(mx, "M2") : 3,
+    !eq(mx, "M4") : 4,
+    !eq(mx, "M8") : 8,
+    // M1 and lower
+    true:           2
+  );
+}
+
+class SiFiveP600VSlideI<string mx> {
+  int c = !cond(
+    !eq(mx, "M2") : 4,
+    !eq(mx, "M4") : 6,
+    !eq(mx, "M8") : 8,
+    // M1 and lower
+    true:           2
+  );
+}
+
+class SiFiveP600VSlideXComplex<string mx, bit isUp = false> {
+  int latency = !cond(
+    !eq(mx, "M2") : 11,
+    !eq(mx, "M4") : 14,
+    !eq(mx, "M8") : 20
+  );
+
+  int cycles = !cond(
+    !eq(mx, "M2") : !if(isUp, 10, 11),
+    !eq(mx, "M4") : !if(isUp, 12, 14),
+    !eq(mx, "M8") : !if(isUp, 16, 20)
+  );
+}
+
+class SiFiveP600VPermutationComplex<string mx> {
+  int c = !cond(
+    !eq(mx, "M2") : 12,
+    !eq(mx, "M4") : 16,
+    !eq(mx, "M8") : 24
+  );
+}
+
+class SiFiveP600VSHA2MSCycles<string mx, int sew> {
+  int c = !cond(
+    !eq(mx, "M2") : !if(!eq(sew, 32), 2, 3),
+    !eq(mx, "M4") : !if(!eq(sew, 32), 4, 6),
+    !eq(mx, "M8") : !if(!eq(sew, 32), 8, 12),
+    true: 1
+  );
+}
+
 // SiFiveP600 machine model for scheduling and other instruction cost heuristics.
 def SiFiveP600Model : SchedMachineModel {
   let IssueWidth = 4;         // 4 micro-ops are dispatched per cycle.
@@ -95,6 +329,12 @@ def SiFiveP600FloatDiv    : ProcResource<1>;
 def SiFiveP600VEXQ0        : ProcResource<1>;
 def SiFiveP600VEXQ1        : ProcResource<1>;
 def SiFiveP600VectorArith  : ProcResGroup<[SiFiveP600VEXQ0, SiFiveP600VEXQ1]>;
+
+// Only VEXQ0 has mask unit.
+defvar SiFiveP600VectorMask = SiFiveP600VEXQ0;
+// Only VEXQ0 has vector crypto.
+defvar SiFiveP600VectorCrypto = SiFiveP600VEXQ0;
+
 def SiFiveP600VLD          : ProcResource<1>;
 def SiFiveP600VST          : ProcResource<1>;
 def SiFiveP600VDiv         : ProcResource<1>;
@@ -386,7 +626,7 @@ foreach LMul = [1, 2, 4, 8] in {
     def : WriteRes<!cast<SchedWrite>("WriteVLD" # LMul # "R"), [SiFiveP600VLD]>;
     def : WriteRes<!cast<SchedWrite>("WriteVST" # LMul # "R"), [SiFiveP600VST]>;
   }
-  let Latency = LMul, ReleaseAtCycles = [LMul] in {
+  let Latency = 2, ReleaseAtCycles = [LMul] in {
     def : WriteRes<!cast<SchedWrite>("WriteVMov" # LMul # "V"), [SiFiveP600VectorArith]>;
   }
 }
@@ -395,37 +635,42 @@ foreach LMul = [1, 2, 4, 8] in {
 foreach mx = SchedMxList in {
   defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
   defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
-  let Latency = 1, ReleaseAtCycles = [LMulLat] in {
-    defm "" : LMULWriteResMX<"WriteVIALUV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIALUX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIALUI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+  let Latency = 2, ReleaseAtCycles = [LMulLat] in {
     defm "" : LMULWriteResMX<"WriteVExtV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICALUV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICALUX",  [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICALUI",  [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICALUMV", [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICALUMX", [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICALUMI", [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICmpV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICmpX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVICmpI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMergeV", [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMergeX", [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMergeI", [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMovV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMovX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMovI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICmpV",   [SiFiveP600VectorMask],  mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICmpX",   [SiFiveP600VectorMask],  mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVICmpI",   [SiFiveP600VectorMask],  mx, IsWorstCase>;
   }
-  let Latency = 6, ReleaseAtCycles = [LMulLat] in {
-    defm "" : LMULWriteResMX<"WriteVShiftV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVShiftX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVShiftI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMulV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMulX",    [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMulAddV", [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVIMulAddX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+  let ReleaseAtCycles = [LMulLat] in {
+    let Latency = 6 in {
+      defm "" : LMULWriteResMX<"WriteVIMulV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMulX",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMulAddV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMulAddX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+    }
+
+    let Latency = !mul(2, SiFiveP600RVVMultiplier<mx>.c) in {
+      defm "" : LMULWriteResMX<"WriteVIALUV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIALUX",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIALUI",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVICALUV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVICALUX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVICALUI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVICALUMV",  [SiFiveP600VectorMask],  mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVICALUMX",  [SiFiveP600VectorMask],  mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVICALUMI",  [SiFiveP600VectorMask],  mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMergeV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMergeX",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMergeI",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMovX",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMovI",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVShiftI",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVShiftV",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVShiftX",   [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+      defm "" : LMULWriteResMX<"WriteVIMovV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
+    }
   }
 }
 // Widening
@@ -440,7 +685,26 @@ foreach mx = SchedMxListW in {
     defm "" : LMULWriteResMX<"WriteVIWMulX",    [SiFiveP600VectorArith], mx, IsWorstCase>;
     defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SiFiveP600VectorArith], mx, IsWorstCase>;
     defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SiFiveP600VectorArith], mx, IsWorstCase>;
+
+    // Special case for variants with widen operands.
+    let ReleaseAtCycles = [!mul(LMulLat, 2)] in
+    def P600WriteVIWALUWidenOp_ # mx : SchedWriteRes<[SiFiveP600VectorArith]>;
   }
+
+  defvar P600VIWALUBaseSchedRW = [!cast<SchedWrite>("P600WriteVIWALUWidenOp_" # mx),
+                                  !cast<SchedRead>("ReadVPassthru_" # mx),
+                                  !cast<SchedRead>("ReadVIALUV_" # mx),
+                                  !cast<SchedRead>("ReadVIALUV_" # mx)];
+
+  def : InstRW<P600VIWALUBaseSchedRW,
+               (instregex "^PseudoVW(ADD|SUB)[U]?_W(V|X)_" # mx # "$")>;
+  def : InstRW<P600VIWALUBaseSchedRW[0,2,3],
+               (instregex "^PseudoVW(ADD|SUB)[U]?_WV_" # mx # "_TIED$")>;
+
+  def : InstRW<!listconcat(P600VIWALUBaseSchedRW, [!cast<SchedRead>("ReadVMask")]),
+               (instregex "^PseudoVW(ADD|SUB)[U]?_W(V|X)_" # mx # "_MASK$")>;
+  def : InstRW<!listconcat(P600VIWALUBaseSchedRW[0,1,3], [!cast<SchedRead>("ReadVMask")]),
+               (instregex "^PseudoVW(ADD|SUB)[U]?_WV_" # mx # "_MASK_TIED$")>;
 }
 
 // Worst case needs 51/45/42/72 * lmul cycles for i8/16/32/64.
@@ -504,34 +768,34 @@ foreach mx = SchedMxListF in {
       defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
       defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
     }
-    let Latency = 2, ReleaseAtCycles = [LMulLat] in
-    defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
-    let Latency = 3, ReleaseAtCycles = [LMulLat] in
-    defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
   }
 }
 foreach mx = SchedMxListF in {
   foreach sew = SchedSEWSet<mx, isF=1>.val in {
     defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
-    defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxList, isF=1>.c;
-    let Latency = 1, ReleaseAtCycles = [LMulLat] in {
+    defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+    let Latency = !mul(2, SiFiveP600RVVMultiplier<mx>.c), ReleaseAtCycles = [LMulLat] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
       defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
       defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV",   [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
-      defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF",   [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
     }
+    let Latency = !if(!eq(mx, "M8"), 4, 3), ReleaseAtCycles = [!if(!eq(LMulLat, 1), 2, LMulLat)] in
+    defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
   }
 }
 foreach mx = SchedMxList in {
   defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
   defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
-  let Latency = 3, ReleaseAtCycles = [LMulLat] in
+  let Latency = !if(!eq(mx, "M8"), 4, 3), ReleaseAtCycles = [!if(!eq(LMulLat, 1), 2, LMulLat)] in
   defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SiFiveP600VectorArith], mx, IsWorstCase>;
   let Latency = 2, ReleaseAtCycles = [LMulLat] in {
-    defm "" : LMULWriteResMX<"WriteVFCmpV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
-    defm "" : LMULWriteResMX<"WriteVFCmpF",  [SiFiveP600VectorArith], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFCmpV",  [SiFiveP600VectorMask], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFCmpF",  [SiFiveP600VectorMask], mx, IsWorstCase>;
   }
-  let Latency = 1, ReleaseAtCycles = [LMulLat] in {
+  let Latency = !mul(2, SiFiveP600RVVMultiplier<mx>.c),
+      ReleaseAtCycles = [!if(!eq(LMulLat, 1), 2, LMulLat)] in {
     defm "" : LMULWriteResMX<"WriteVFClassV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
     defm "" : LMULWriteResMX<"WriteVFMergeV",  [SiFiveP600VectorArith], mx, IsWorstCase>;
     defm "" : LMULWriteResMX<"WriteVFMovV",    [SiFiveP600VectorArith], mx, IsWorstCase>;
@@ -565,7 +829,31 @@ foreach mx = SchedMxListFW in {
       defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
       defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
       defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SiFiveP600VectorArith], mx, sew, IsWorstCase>;
+
+      // Special case for variants with widen operands.
+      let ReleaseAtCycles = [!mul(LMulLat, 2)] in
+      def P600WriteVFWALUWidenOp_ # mx # _E # sew : SchedWriteRes<[SiFiveP600VectorArith]>;
     }
+
+    defvar P600VFWALUBaseSchedRW = [!cast<SchedWrite>("P600WriteVFWALUWidenOp_" # mx # "_E" # sew),
+                                    !cast<SchedRead>("ReadVPassthru_" # mx # "_E" # sew),
+                                    !cast<SchedRead>("ReadVFWALUV_" # mx # "_E" # sew)];
+
+    def : InstRW<!listconcat(P600VFWALUBaseSchedRW, [!cast<SchedRead>("ReadVFWALUV_" # mx # "_E" # sew)]),
+                 (instregex "^PseudoVFW(ADD|SUB)_WV_" # mx # "_E" # sew # "$")>;
+    def : InstRW<[P600VFWALUBaseSchedRW[0], P600VFWALUBaseSchedRW[2], !cast<SchedRead>("ReadVFWALUV_" # mx # "_E" # sew)],
+                 (instregex "^PseudoVFW(ADD|SUB)_WV_" # mx # "_E" # sew # "_TIED$")>;
+
+    def : InstRW<!listconcat(P600VFWALUBaseSchedRW, [!cast<SchedRead>("ReadVFWALUF_" # mx # "_E" # sew)]),
+                 (instregex "^PseudoVFW(ADD|SUB)_WFPR" # sew # "_" # mx # "_E" # sew # "$")>;
+
+    def : InstRW<!listconcat(P600VFWALUBaseSchedRW, [!cast<SchedRead>("ReadVFWALUV_" # mx # "_E" # sew), !cast<SchedRead>("ReadVMask")]),
+                 (instregex "^PseudoVFW(ADD|SUB)_WV_" # mx # "_E" # sew # "_MASK$")>;
+    def : InstRW<[P600VFWALUBaseSchedRW[0], P600VFWALUBaseSchedRW[1], !cast<SchedRead>("ReadVFWALUV_" # mx # "_E" # sew), !cast<SchedRead>("ReadVMask")],
+                 (instregex "^PseudoVFW(ADD|SUB)_WV_" # mx # "_E" # sew # "_MASK_TIED$")>;
+
+    def : InstRW<!listconcat(P600VFWALUBaseSchedRW, [!cast<SchedRead>("ReadVFWALUF_" # mx # "_E" # sew), !cast<SchedRead>("ReadVMask")]),
+                 (instregex "^PseudoVFW(ADD|SUB)_WFPR" # sew # "_" # mx # "_E" # sew # "_MASK$")>;
   }
 }
 // Narrowing
@@ -580,7 +868,7 @@ foreach mx = SchedMxListFW in {
   foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
     defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
     defvar IsWorstCase = SiFiveP600IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
-    let Latency = 3, ReleaseAtCycles = [LMulLat] in {
+    let Latency = 3, ReleaseAtCycles = [!if(!eq(LMulLat, 1), 2, LMulLat)] in {
       defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SiF...
[truncated]

@mshockwave
Copy link
Member Author

ping

Copy link
Contributor

@wangpc-pp wangpc-pp left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

LGTM.

@mshockwave mshockwave merged commit 84e95be into llvm:main Nov 12, 2024
8 of 10 checks passed
@mshockwave mshockwave deleted the patch/rvv/sifive-p600-sched-update branch November 12, 2024 23:29
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants