-
Notifications
You must be signed in to change notification settings - Fork 15.1k
[RISCV] Update SiFive7's scheduling models with their optimizations on permutation instructions #160763
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[RISCV] Update SiFive7's scheduling models with their optimizations on permutation instructions #160763
Conversation
|
@llvm/pr-subscribers-backend-risc-v Author: Min-Yih Hsu (mshockwave) Changes: In newer SiFive7 cores like X390, permutation instructions like vrgather.vv whose LMUL fits within a single DLEN can complete in a constant number of cycles. For slightly larger data that satisfies the constraint `log2(SEW/8) + log2(LMUL) <= log2(DLEN / 32)`, these instructions can also yield cycle counts that are proportional to the square of LMUL, rather than being proportional to VL. Patch is 79.97 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/160763.diff 3 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
index 2e14ae3af957e..84b3f6497c75c 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td
@@ -169,6 +169,64 @@ class SiFive7GetOrderedReductionCycles<string mx, int sew, int VLEN> {
int c = !mul(6, VLUpperBound);
}
+class isSingleDLEN<string mx> {
+ bit c = !or(!eq(mx, "MF2"), !or(!eq(mx, "MF4"), !eq(mx, "MF8")));
+}
+
+class SiFive7GetCyclesVRGatherVV<string mx, int sew, int VLEN,
+ bit hasFastGather> {
+ // if (hasFastGather && isSingleDLEN(mx))
+ // c = 1;
+ // else if (hasFastGather && (log2(SEW/8) + log2(LMUL) <= log2(DLEN / 32))
+ // c = LMUL * 2 * ceil(vl * SEW / DLEN);
+ // else
+ // c = vl;
+
+ defvar y = !logtwo(!div(sew, 8));
+ defvar x = !cond(
+ !eq(mx, "M1") : y,
+ !eq(mx, "M2") : !add(y, 1),
+ !eq(mx, "M4") : !add(y, 2),
+ !eq(mx, "M8") : !add(y, 3),
+ // Give isSingleDLEN(mx) cases a garbage value to avoid build failures,
+ // even though x will go unused.
+ true : 1
+ );
+ // LMUL * 2 * ceil(vl * SEW / DLEN) = LMUL * 2 * ceil(2 * LMUL)
+ defvar z = !cond(
+ !eq(mx, "M1") : 4,
+ !eq(mx, "M2") : 16,
+ !eq(mx, "M4") : 64,
+ !eq(mx, "M8") : 256,
+ // Give isSingleDLEN(mx) cases a garbage value to avoid build failures,
+ // even though z will go unused.
+ true : 1
+ );
+ defvar VLUpperBound = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ bit IsSingleDLEN = isSingleDLEN<mx>.c;
+
+ int c = !cond(
+ !and(hasFastGather, IsSingleDLEN) : 1,
+ !and(hasFastGather, !le(x, !logtwo(!div(VLEN, 64)))) : z,
+ true: VLUpperBound
+ );
+}
+
+class SiFive7GetCyclesVCompress<string mx, int sew, int VLEN,
+ bit hasFastGather> {
+
+ // if (hasFastGather && isSingleDLEN(mx))
+ // c = 1
+ // else
+ // c = vl
+ defvar VLUpperBound = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ bit IsSingleDLEN = isSingleDLEN<mx>.c;
+
+ int c = !if(!and(hasFastGather, IsSingleDLEN),
+ 1,
+ VLUpperBound);
+}
+
class SiFive7GetSiFiveVFNRClipCycles<string mx, int VLEN> {
int latency = !cond(
!eq(mx, "MF8"): 7,
@@ -259,7 +317,8 @@ multiclass SiFive7WriteResBase<int VLEN,
ProcResourceKind VL, ProcResourceKind VS,
ProcResourceKind VCQ,
SiFive7FPLatencies fpLatencies,
- bit isFP64Throttled = false> {
+ bit isFP64Throttled = false,
+ bit hasFastGather = false> {
// Branching
let Latency = 3 in {
@@ -976,13 +1035,33 @@ multiclass SiFive7WriteResBase<int VLEN,
foreach mx = SchedMxList in {
foreach sew = SchedSEWSet<mx>.val in {
- defvar Cycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
- let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
- defm : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [VCQ, VA1], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [VCQ, VA1], mx, sew, IsWorstCase>;
- defm : LMULSEWWriteResMXSEW<"WriteVCompressV", [VCQ, VA1], mx, sew, IsWorstCase>;
- }
+ defvar IsSingleDLEN = isSingleDLEN<mx>.c;
+
+ defvar GatherVVCycles =
+ SiFive7GetCyclesVRGatherVV<mx, sew, VLEN, hasFastGather>.c;
+ // 7 + DLEN/ SEW
+ defvar SlowGatherLat = !add(7, !div(!div(VLEN, 2), sew));
+ defvar GatherVVLat = !if(hasFastGather,
+ !add(3, GatherVVCycles), SlowGatherLat);
+
+ let Latency = GatherVVLat, AcquireAtCycles = [0, 1],
+ ReleaseAtCycles = [1, !add(5, GatherVVCycles)] in
+ defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [VCQ, VA1], mx, sew, IsWorstCase>;
+
+ // VRGatherEI16VV is not improved by fastGather or fastLargeGather.
+ defvar GatherEI16VVCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
+ let Latency = SlowGatherLat, AcquireAtCycles = [0, 1],
+ ReleaseAtCycles = [1, !add(5, GatherEI16VVCycles)] in
+ defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [VCQ, VA1], mx, sew, IsWorstCase>;
+
+ defvar CompressCycles = SiFive7GetCyclesVCompress<mx, sew, VLEN, hasFastGather>.c;
+ defvar CompressLat = !if(!and(hasFastGather, IsSingleDLEN),
+ 4,
+ !add(7, CompressCycles)); // 7 + VL
+ let Latency = CompressLat, AcquireAtCycles = [0, 1],
+ ReleaseAtCycles = [1, !add(8, CompressCycles)] in
+ defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [VCQ, VA1], mx, sew, IsWorstCase>;
}
}
@@ -1408,7 +1487,8 @@ multiclass SiFive7ReadAdvance {
/// eventually be supplied by different SchedMachineModels.
multiclass SiFive7SchedResources<int vlen, bit extraVALU,
SiFive7FPLatencies fpLatencies,
- bit isFP64Throttled> {
+ bit isFP64Throttled,
+ bit hasFastGather> {
defm SiFive7 : SiFive7ProcResources<extraVALU>;
// Pull out defs from SiFive7ProcResources so we can refer to them by name.
@@ -1435,7 +1515,8 @@ multiclass SiFive7SchedResources<int vlen, bit extraVALU,
: SiFive7WriteResBase<vlen, SiFive7PipeA, SiFive7PipeB, SiFive7PipeAB,
SiFive7IDiv, SiFive7FDiv, SiFive7VA1,
SiFive7VA1OrVA2, SiFive7VL, SiFive7VS,
- SiFive7VCQ, fpLatencies, isFP64Throttled>;
+ SiFive7VCQ, fpLatencies, isFP64Throttled,
+ hasFastGather>;
//===----------------------------------------------------------------------===//
// Bypass and advance
@@ -1468,6 +1549,7 @@ class SiFive7SchedMachineModel<int vlen> : SchedMachineModel {
SiFive7FPLatencies FPLatencies;
bit IsFP64Throttled = false;
+ bit HasFastGather = false;
string Name = !subst("Model", "", !subst("SiFive7", "", NAME));
}
@@ -1494,6 +1576,7 @@ def SiFive7VLEN1024X300Model : SiFive7SchedMachineModel<1024> {
let HasExtraVALU = true;
let FPLatencies = SiFive7LowFPLatencies;
let IsFP64Throttled = true;
+ let HasFastGather = true;
}
/// Binding models to their scheduling resources.
@@ -1501,7 +1584,8 @@ foreach model = [SiFive7VLEN512Model, SiFive7VLEN1024X300Model] in {
let SchedModel = model in
defm model.Name : SiFive7SchedResources<model.VLEN, model.HasExtraVALU,
model.FPLatencies,
- model.IsFP64Throttled>;
+ model.IsFP64Throttled,
+ model.HasFastGather>;
}
// Some model name aliases.
diff --git a/llvm/test/tools/llvm-mca/RISCV/SiFive7/vgather-vcompress.s b/llvm/test/tools/llvm-mca/RISCV/SiFive7/vgather-vcompress.s
new file mode 100644
index 0000000000000..4ec1683a886dc
--- /dev/null
+++ b/llvm/test/tools/llvm-mca/RISCV/SiFive7/vgather-vcompress.s
@@ -0,0 +1,314 @@
+# NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py
+# RUN: llvm-mca -mtriple=riscv64 -mcpu=sifive-x280 -instruction-tables=full -iterations=1 < %s | FileCheck %s
+
+# The legal (SEW, LMUL) pairs are:
+# (e8, mf8) (e8, mf4) (e8, mf2) (e8, m1) (e8, m2) (e8, m4) (e8, m8)
+# (e16, mf4) (e16, mf2) (e16, m1) (e16, m2) (e16, m4) (e16, m8)
+# (e32, mf2) (e32, m1) (e32, m2) (e32, m4) (e32, m8)
+# (e64, m1) (e64, m2) (e64, m4) (e64, m8)
+
+vsetvli zero, zero, e8, mf8, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e8, mf4, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e8, mf2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e8, m1, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e8, m2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e8, m4, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e8, m8, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e16, mf4, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e16, mf2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e16, m1, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e16, m2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e16, m4, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e16, m8, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e32, mf2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e32, m1, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e32, m2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e32, m4, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e32, m8, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e64, m1, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e64, m2, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e64, m4, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+vsetvli zero, zero, e64, m8, tu, mu
+vrgather.vv v8, v16, v24
+vrgatherei16.vv v8, v16, v24
+vcompress.vm v8, v16, v24
+
+# CHECK: Resources:
+# CHECK-NEXT: [0] - VLEN512SiFive7FDiv:1
+# CHECK-NEXT: [1] - VLEN512SiFive7IDiv:1
+# CHECK-NEXT: [2] - VLEN512SiFive7PipeA:1
+# CHECK-NEXT: [3] - VLEN512SiFive7PipeAB:2 VLEN512SiFive7PipeA, VLEN512SiFive7PipeB
+# CHECK-NEXT: [4] - VLEN512SiFive7PipeB:1
+# CHECK-NEXT: [5] - VLEN512SiFive7VA:1
+# CHECK-NEXT: [6] - VLEN512SiFive7VCQ:1
+# CHECK-NEXT: [7] - VLEN512SiFive7VL:1
+# CHECK-NEXT: [8] - VLEN512SiFive7VS:1
+
+# CHECK: Instruction Info:
+# CHECK-NEXT: [1]: #uOps
+# CHECK-NEXT: [2]: Latency
+# CHECK-NEXT: [3]: RThroughput
+# CHECK-NEXT: [4]: MayLoad
+# CHECK-NEXT: [5]: MayStore
+# CHECK-NEXT: [6]: HasSideEffects (U)
+# CHECK-NEXT: [7]: Bypass Latency
+# CHECK-NEXT: [8]: Resources (<Name> | <Name>[<ReleaseAtCycle>] | <Name>[<AcquireAtCycle>,<ReleaseAtCycle])
+# CHECK-NEXT: [9]: LLVM Opcode Name
+
+# CHECK: [1] [2] [3] [4] [5] [6] [7] [8] [9] Instructions:
+# CHECK-NEXT: 1 3 1.00 U 1 VLEN512SiFive7PipeA,VLEN512SiFive7PipeAB VSETVLI vsetvli zero, zero, e8, mf8, tu, mu
+# CHECK-NEXT: 1 39 12.00 39 VLEN512SiFive7VA[1,13],VLEN512SiFive7VCQ VRGATHER_VV vrgather.vv v8, v16, v24
+# CHECK-NEXT: 1 39 12.00 39 VLEN512SiFive7VA[1,13],VLEN512SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: 1 15 15.00 15 VLEN512SiFive7VA[1,16],VLEN512SiFive7VCQ VCOMPRESS_VM vcompress.vm v8, v16, v24
+# CHECK-NEXT: 1 3 1.00 U 1 VLEN512SiFive7PipeA,VLEN512SiFive7PipeAB VSETVLI vsetvli zero, zero, e8, mf4, tu, mu
+# CHECK-NEXT: 1 39 20.00 39 VLEN512SiFive7VA[1,21],VLEN512SiFive7VCQ VRGATHER_VV vrgather.vv v8, v16, v24
+# CHECK-NEXT: 1 39 20.00 39 VLEN512SiFive7VA[1,21],VLEN512SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: 1 23 23.00 23 VLEN512SiFive7VA[1,24],VLEN512SiFive7VCQ VCOMPRESS_VM vcompress.vm v8, v16, v24
+# CHECK-NEXT: 1 3 1.00 U 1 VLEN512SiFive7PipeA,VLEN512SiFive7PipeAB VSETVLI vsetvli zero, zero, e8, mf2, tu, mu
+# CHECK-NEXT: 1 39 36.00 39 VLEN512SiFive7VA[1,37],VLEN512SiFive7VCQ VRGATHER_VV vrgather.vv v8, v16, v24
+# CHECK-NEXT: 1 39 36.00 39 VLEN512SiFive7VA[1,37],VLEN512SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: 1 39 39.00 39 VLEN512SiFive7VA[1,40],VLEN512SiFive7VCQ VCOMPRESS_VM vcompress.vm v8, v16, v24
+# CHECK-NEXT: 1 3 1.00 U 1 VLEN512SiFive7PipeA,VLEN512SiFive7PipeAB VSETVLI vsetvli zero, zero, e8, m1, tu, mu
+# CHECK-NEXT: 1 39 68.00 39 VLEN512SiFive7VA[1,69],VLEN512SiFive7VCQ VRGATHER_VV vrgather.vv v8, v16, v24
+# CHECK-NEXT: 1 39 68.00 39 VLEN512SiFive7VA[1,69],VLEN512SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: 1 71 71.00 71 VLEN512SiFive7VA[1,72],VLEN512SiFive7VCQ VCOMPRESS_VM vcompress.vm v8, v16, v24
+# CHECK-NEXT: 1 3 1.00 U 1 VLEN512SiFive7PipeA,VLEN512SiFive7PipeAB VSETVLI vsetvli zero, zero, e8, m2, tu, mu
+# CHECK-NEXT: 1 39 132.00 39 VLEN512SiFive7VA[1,133],VLEN512SiFive7VCQ VRGATHER_VV vrgather.vv v8, v16, v24
+# CHECK-NEXT: 1 39 132.00 39 VLEN512SiFive7VA[1,133],VLEN512SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: 1 135 135.00 135 VLEN512SiFive7VA[1,136],VLEN512SiFive7VCQ VCOMPRESS_VM vcompress.vm v8, v16, v24
+# CHECK-NEXT: 1 3 1.00 U 1 VLEN512SiFive7PipeA,VLEN512SiFive7PipeAB VSETVLI vsetvli zero, zero, e8, m4, tu, mu
+# CHECK-NEXT: 1 39 260.00 39 VLEN512SiFive7VA[1,261],VLEN512SiFive7VCQ VRGATHER_VV vrgather.vv v8, v16, v24
+# CHECK-NEXT: 1 39 260.00 39 VLEN512SiFive7VA[1,261],VLEN512SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: 1 263 263.00 263 VLEN512SiFive7VA[1,264],VLEN512SiFive7VCQ VCOMPRESS_VM vcompress.vm v8, v16, v24
+# CHECK-NEXT: 1 3 1.00 U 1 VLEN512SiFive7PipeA,VLEN512SiFive7PipeAB VSETVLI vsetvli zero, zero, e8, m8, tu, mu
+# CHECK-NEXT: 1 39 516.00 39 VLEN512SiFive7VA[1,517],VLEN512SiFive7VCQ VRGATHER_VV vrgather.vv v8, v16, v24
+# CHECK-NEXT: 1 39 516.00 39 VLEN512SiFive7VA[1,517],VLEN512SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: 1 519 519.00 519 VLEN512SiFive7VA[1,520],VLEN512SiFive7VCQ VCOMPRESS_VM vcompress.vm v8, v16, v24
+# CHECK-NEXT: 1 3 1.00 U 1 VLEN512SiFive7PipeA,VLEN512SiFive7PipeAB VSETVLI vsetvli zero, zero, e16, mf4, tu, mu
+# CHECK-NEXT: 1 23 12.00 23 VLEN512SiFive7VA[1,13],VLEN512SiFive7VCQ VRGATHER_VV vrgather.vv v8, v16, v24
+# CHECK-NEXT: 1 23 12.00 23 VLEN512SiFive7VA[1,13],VLEN512SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: 1 15 15.00 15 VLEN512SiFive7VA[1,16],VLEN512SiFive7VCQ VCOMPRESS_VM vcompress.vm v8, v16, v24
+# CHECK-NEXT: 1 3 1.00 U 1 VLEN512SiFive7PipeA,VLEN512SiFive7PipeAB VSETVLI vsetvli zero, zero, e16, mf2, tu, mu
+# CHECK-NEXT: 1 23 20.00 23 VLEN512SiFive7VA[1,21],VLEN512SiFive7VCQ VRGATHER_VV vrgather.vv v8, v16, v24
+# CHECK-NEXT: 1 23 20.00 23 VLEN512SiFive7VA[1,21],VLEN512SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: 1 23 23.00 23 VLEN512SiFive7VA[1,24],VLEN512SiFive7VCQ VCOMPRESS_VM vcompress.vm v8, v16, v24
+# CHECK-NEXT: 1 3 1.00 U 1 VLEN512SiFive7PipeA,VLEN512SiFive7PipeAB VSETVLI vsetvli zero, zero, e16, m1, tu, mu
+# CHECK-NEXT: 1 23 36.00 23 VLEN512SiFive7VA[1,37],VLEN512SiFive7VCQ VRGATHER_VV vrgather.vv v8, v16, v24
+# CHECK-NEXT: 1 23 36.00 23 VLEN512SiFive7VA[1,37],VLEN512SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: 1 39 39.00 39 VLEN512SiFive7VA[1,40],VLEN512SiFive7VCQ VCOMPRESS_VM vcompress.vm v8, v16, v24
+# CHECK-NEXT: 1 3 1.00 U 1 VLEN512SiFive7PipeA,VLEN512SiFive7PipeAB VSETVLI vsetvli zero, zero, e16, m2, tu, mu
+# CHECK-NEXT: 1 23 68.00 23 VLEN512SiFive7VA[1,69],VLEN512SiFive7VCQ VRGATHER_VV vrgather.vv v8, v16, v24
+# CHECK-NEXT: 1 23 68.00 23 VLEN512SiFive7VA[1,69],VLEN512SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: 1 71 71.00 71 VLEN512SiFive7VA[1,72],VLEN512SiFive7VCQ VCOMPRESS_VM vcompress.vm v8, v16, v24
+# CHECK-NEXT: 1 3 1.00 U 1 VLEN512SiFive7PipeA,VLEN512SiFive7PipeAB VSETVLI vsetvli zero, zero, e16, m4, tu, mu
+# CHECK-NEXT: 1 23 132.00 23 VLEN512SiFive7VA[1,133],VLEN512SiFive7VCQ VRGATHER_VV vrgather.vv v8, v16, v24
+# CHECK-NEXT: 1 23 132.00 23 VLEN512SiFive7VA[1,133],VLEN512SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: 1 135 135.00 135 VLEN512SiFive7VA[1,136],VLEN512SiFive7VCQ VCOMPRESS_VM vcompress.vm v8, v16, v24
+# CHECK-NEXT: 1 3 1.00 U 1 VLEN512SiFive7PipeA,VLEN512SiFive7PipeAB VSETVLI vsetvli zero, zero, e16, m8, tu, mu
+# CHECK-NEXT: 1 23 260.00 23 VLEN512SiFive7VA[1,261],VLEN512SiFive7VCQ VRGATHER_VV vrgather.vv v8, v16, v24
+# CHECK-NEXT: 1 23 260.00 23 VLEN512SiFive7VA[1,261],VLEN512SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: 1 263 263.00 263 VLEN512SiFive7VA[1,264],VLEN512SiFive7VCQ VCOMPRESS_VM vcompress.vm v8, v16, v24
+# CHECK-NEXT: 1 3 1.00 U 1 VLEN512SiFive7PipeA,VLEN512SiFive7PipeAB VSETVLI vsetvli zero, zero, e32, mf2, tu, mu
+# CHECK-NEXT: 1 15 12.00 15 VLEN512SiFive7VA[1,13],VLEN512SiFive7VCQ VRGATHER_VV vrgather.vv v8, v16, v24
+# CHECK-NEXT: 1 15 12.00 15 VLEN512SiFive7VA[1,13],VLEN512SiFive7VCQ VRGATHEREI16_VV vrgatherei16.vv v8, v16, v24
+# CHECK-NEXT: 1 15 15.00 15 VLEN512SiFive7VA[1,16],VLEN512SiFive7VCQ VCOMPRESS_VM vcompress.vm v8, v16, v24
+# CHECK-NEXT: 1 3 1.00 U 1 VLEN512SiFive7PipeA,VLEN512SiFive...
[truncated]
|
…ction optimizations Co-Authored-By: Michael Maitland <[email protected]>
ffe211c to
f87dcca
Compare
wangpc-pp
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM.
| } | ||
|
|
||
| class isSingleDLEN<string mx> { | ||
| bit c = !or(!eq(mx, "MF2"), !or(!eq(mx, "MF4"), !eq(mx, "MF8"))); |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This assumes DLen is always half of VLen. Is this true for all X-series cores?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That is correct.
|
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/116/builds/18933 Here is the relevant piece of the build log for the reference |
…n permutation instructions (llvm#160763) In newer SiFive7 cores like X390, permutation instructions like vrgather.vv whose LMUL fits within a single DLEN can complete in a constant number of cycles. For slightly larger data that satisfies the constraint `log2(SEW/8) + log2(LMUL) <= log2(DLEN / 32)`, these instructions can also yield cycle counts that are proportional to the square of LMUL, rather than being proportional to VL. Co-authored-by: Michael Maitland <[email protected]>
In newer SiFive7 cores like X390, permutation instructions like vrgather.vv whose LMUL fits within a single DLEN can complete in a constant number of cycles. For slightly larger data that satisfies the constraint
log2(SEW/8) + log2(LMUL) <= log2(DLEN / 32), these instructions can also yield cycle counts that are proportional to the square of LMUL, rather than being proportional to VL.