From f87dcca5beaaf21f8bc1f48701677f81e3c228a8 Mon Sep 17 00:00:00 2001
From: Min-Yih Hsu
Date: Thu, 25 Sep 2025 12:09:27 -0700
Subject: [PATCH] [RISCV] Update SiFive7's scheduling model with its
 permutation instruction optimizations

Co-Authored-By: Michael Maitland
---
 llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 104 +++++-
 .../RISCV/SiFive7/vgather-vcompress.s      | 314 ++++++++++++++++++
 .../RISCV/SiFiveX390/vgather-vcompress.s   | 268 +++++++--------
 3 files changed, 542 insertions(+), 144 deletions(-)
 create mode 100644 llvm/test/tools/llvm-mca/RISCV/SiFive7/vgather-vcompress.s

diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 2e14ae3af957e..3f2e7dbd07a67 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -169,6 +169,64 @@ class SiFive7GetOrderedReductionCycles<string mx, int sew, int VLEN> {
   int c = !mul(6, VLUpperBound);
 }
 
+class isSingleDLEN<string mx> {
+  bit c = !or(!eq(mx, "MF2"), !or(!eq(mx, "MF4"), !eq(mx, "MF8")));
+}
+
+class SiFive7GetCyclesVRGatherVV<string mx, int sew, int VLEN,
+                                 bit hasFastGather> {
+  // if (hasFastGather && isSingleDLEN(mx))
+  //   c = 1;
+  // else if (hasFastGather && (log2(SEW/8) + log2(LMUL) <= log2(DLEN / 32)))
+  //   c = LMUL * 2 * ceil(vl * SEW / DLEN);
+  // else
+  //   c = vl;
+
+  defvar y = !logtwo(!div(sew, 8));
+  defvar x = !cond(
+    !eq(mx, "M1") : y,
+    !eq(mx, "M2") : !add(y, 1),
+    !eq(mx, "M4") : !add(y, 2),
+    !eq(mx, "M8") : !add(y, 3),
+    // Give isSingleDLEN(mx) cases a garbage value to avoid build failures,
+    // even though x will go unused.
+    true : 1
+  );
+  // LMUL * 2 * ceil(vl * SEW / DLEN) = LMUL * 2 * ceil(2 * LMUL)
+  defvar z = !cond(
+    !eq(mx, "M1") : 4,
+    !eq(mx, "M2") : 16,
+    !eq(mx, "M4") : 64,
+    !eq(mx, "M8") : 256,
+    // Give isSingleDLEN(mx) cases a garbage value to avoid build failures,
+    // even though z will go unused.
+    true : 1
+  );
+  defvar VLUpperBound = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+  bit IsSingleDLEN = isSingleDLEN<mx>.c;
+
+  int c = !cond(
+    !and(hasFastGather, IsSingleDLEN) : 1,
+    !and(hasFastGather, !le(x, !logtwo(!div(VLEN, 64)))) : z,
+    true: VLUpperBound
+  );
+}
+
+class SiFive7GetCyclesVCompress<string mx, int sew, int VLEN,
+                                bit hasFastGather> {
+
+  // if (hasFastGather && isSingleDLEN(mx))
+  //   c = 1
+  // else
+  //   c = vl
+  defvar VLUpperBound = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+  bit IsSingleDLEN = isSingleDLEN<mx>.c;
+
+  int c = !if(!and(hasFastGather, IsSingleDLEN),
+              1,
+              VLUpperBound);
+}
+
 class SiFive7GetSiFiveVFNRClipCycles<string mx> {
   int latency = !cond(
     !eq(mx, "MF8"): 7,
@@ -259,7 +317,8 @@ multiclass SiFive7WriteResBase
-                               bit isFP64Throttled = false> {
+                               bit isFP64Throttled = false,
+                               bit hasFastGather = false> {
 
   // Branching
   let Latency = 3 in {
@@ -976,13 +1035,33 @@ multiclass SiFive7WriteResBase
   foreach mx = SchedMxList in {
     foreach sew = SchedSEWSet<mx>.val in {
-      defvar Cycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
       defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
-      let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
-        defm : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [VCQ, VA1], mx, sew, IsWorstCase>;
-        defm : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [VCQ, VA1], mx, sew, IsWorstCase>;
-        defm : LMULSEWWriteResMXSEW<"WriteVCompressV", [VCQ, VA1], mx, sew, IsWorstCase>;
-      }
+      defvar IsSingleDLEN = isSingleDLEN<mx>.c;
+
+      defvar GatherVVCycles =
+        SiFive7GetCyclesVRGatherVV<mx, sew, VLEN, hasFastGather>.c;
+      // 7 + DLEN / SEW
+      defvar SlowGatherLat = !add(7, !div(!div(VLEN, 2), sew));
+      defvar GatherVVLat = !if(hasFastGather,
+                               !add(3, GatherVVCycles), SlowGatherLat);
+
+      let Latency = GatherVVLat, AcquireAtCycles = [0, 1],
+          ReleaseAtCycles = [1, !add(5, GatherVVCycles)] in
+      defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [VCQ, VA1], mx, sew, IsWorstCase>;
+
+      // VRGatherEI16VV is not improved by fastGather.
+      defvar GatherEI16VVCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+      let Latency = SlowGatherLat, AcquireAtCycles = [0, 1],
+          ReleaseAtCycles = [1, !add(5, GatherEI16VVCycles)] in
+      defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [VCQ, VA1], mx, sew, IsWorstCase>;
+
+      defvar CompressCycles = SiFive7GetCyclesVCompress<mx, sew, VLEN, hasFastGather>.c;
+      defvar CompressLat = !if(!and(hasFastGather, IsSingleDLEN),
+                               4,
+                               !add(7, CompressCycles)); // 7 + VL
+      let Latency = CompressLat, AcquireAtCycles = [0, 1],
+          ReleaseAtCycles = [1, !add(8, CompressCycles)] in
+      defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [VCQ, VA1], mx, sew, IsWorstCase>;
     }
   }
 
@@ -1408,7 +1487,8 @@ multiclass SiFive7ReadAdvance
 /// eventually be supplied by different SchedMachineModels.
 multiclass SiFive7SchedResources<int VLEN, bit extraVALU,
                                  SiFive7FPLatencies fpLatencies,
-                                 bit isFP64Throttled> {
+                                 bit isFP64Throttled,
+                                 bit hasFastGather> {
   defm SiFive7 : SiFive7ProcResources<extraVALU>;
 
   // Pull out defs from SiFive7ProcResources so we can refer to them by name.
@@ -1435,7 +1515,8 @@ multiclass SiFive7SchedResources
-                             SiFive7VCQ, fpLatencies, isFP64Throttled>;
+                             SiFive7VCQ, fpLatencies, isFP64Throttled,
+                             hasFastGather>;
 
 //===----------------------------------------------------------------------===//
 // Bypass and advance
@@ -1468,6 +1549,7 @@ class SiFive7SchedMachineModel<int vlen> : SchedMachineModel {
   SiFive7FPLatencies FPLatencies;
   bit IsFP64Throttled = false;
+  bit HasFastGather = false;
 
   string Name = !subst("Model", "", !subst("SiFive7", "", NAME));
 }
@@ -1494,6 +1576,7 @@ def SiFive7VLEN1024X300Model : SiFive7SchedMachineModel<1024> {
   let HasExtraVALU = true;
   let FPLatencies = SiFive7LowFPLatencies;
   let IsFP64Throttled = true;
+  let HasFastGather = true;
 }
 
 /// Binding models to their scheduling resources.
@@ -1501,7 +1584,8 @@ foreach model = [SiFive7VLEN512Model, SiFive7VLEN1024X300Model] in {
   let SchedModel = model in
     defm model.Name : SiFive7SchedResources<model.VLEN, model.HasExtraVALU,
                                             model.FPLatencies,
-                                            model.IsFP64Throttled>;
+                                            model.IsFP64Throttled,
+                                            model.HasFastGather>;
 }
 
 // Some model name aliases.
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFive7/vgather-vcompress.s b/llvm/test/tools/llvm-mca/RISCV/SiFive7/vgather-vcompress.s
new file mode 100644
index 0000000000000..4ec1683a886dc
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFive7/vgather-vcompress.s
@@ -0,0 +1,314 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x280 -instruction-tables=full -iterations=1 < %s | FileCheck %s
+
+# The legal (SEW, LMUL) pairs are:
+#   (e8, mf8) (e8, mf4) (e8, mf2) (e8, m1) (e8, m2) (e8, m4) (e8, m8)
+#   (e16, mf4) (e16, mf2) (e16, m1) (e16, m2) (e16, m4) (e16, m8)
+#   (e32, mf2) (e32, m1) (e32, m2) (e32, m4) (e32, m8)
+#   (e64, m1) (e64, m2) (e64, m4) (e64, m8)
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e8, mf4, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e8, mf2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e8, m1, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e8, m2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e8, m4, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e8, m8, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e16, mf4, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e16, mf2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e16, m1, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e16, m2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e16, m4, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e16, m8, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e32, mf2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e32, m1, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e32, m2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e32, m4, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e32, m8, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e64, m1, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e64, m2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e64, m4, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e64, m8, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - VLEN512SiFive7FDiv:1
+# CHECK-NEXT: [1] - VLEN512SiFive7IDiv:1
+# CHECK-NEXT: [2] - VLEN512SiFive7PipeA:1
+# CHECK-NEXT: [3] - VLEN512SiFive7PipeAB:2 VLEN512SiFive7PipeA, VLEN512SiFive7PipeB
+# CHECK-NEXT: [4] - VLEN512SiFive7PipeB:1
+# CHECK-NEXT: [5] - VLEN512SiFive7VA:1
+# CHECK-NEXT: [6] - VLEN512SiFive7VCQ:1
+# CHECK-NEXT: [7] - VLEN512SiFive7VL:1
+# CHECK-NEXT: [8] - VLEN512SiFive7VS:1
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+# CHECK-NEXT: [7]: Bypass Latency
+# CHECK-NEXT: [8]: Resources (<Name> | <Name>[<ReleaseAtCycle>] | <Name>[<AcquireAtCycle>,<ReleaseAtCycle>])
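
Note (not part of the patch): the fast-gather cycle count in SiFive7GetCyclesVRGatherVV can be sanity-checked numerically. The sketch below is a hypothetical Python mirror of that TableGen class, under the same assumptions the in-patch comments make: DLEN = VLEN/2 and worst-case vl = VLMAX = VLEN * LMUL / SEW. Function and variable names are illustrative only.

import math

# Assumed mirror of SiFive7GetCyclesVRGatherVV (DLEN = VLEN/2 on SiFive7).
LMUL = {"MF8": 0.125, "MF4": 0.25, "MF2": 0.5,
        "M1": 1.0, "M2": 2.0, "M4": 4.0, "M8": 8.0}

def vrgather_vv_cycles(mx: str, sew: int, vlen: int, has_fast_gather: bool) -> int:
    lmul = LMUL[mx]
    dlen = vlen // 2
    vl_max = int(vlen * lmul) // sew             # worst case: one element per cycle
    is_single_dlen = mx in ("MF2", "MF4", "MF8") # the isSingleDLEN cases
    if has_fast_gather and is_single_dlen:
        return 1
    # The x <= log2(VLEN/64) guard: log2(SEW/8) + log2(LMUL) <= log2(DLEN/32).
    if has_fast_gather and math.log2(sew / 8) + math.log2(lmul) <= math.log2(dlen / 32):
        # LMUL * 2 * ceil(vl_max * SEW / DLEN); with vl_max = VLMAX this is
        # LMUL * 2 * ceil(2 * LMUL), i.e. the z table.
        return int(lmul * 2 * math.ceil(2 * lmul))
    return vl_max

if __name__ == "__main__":
    for mx in ("M1", "M2", "M4", "M8"):
        print(mx, vrgather_vv_cycles(mx, 8, 1024, True))  # prints 4, 16, 64, 256

Running it reproduces the z table hard-coded in the patch (4, 16, 64, 256 for M1 through M8), and any (mx, sew) pair that fails the guard falls back to vl, matching the one-element-per-cycle path.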