Skip to content

Commit 4ec73eb

Browse files
mshockwavemichaelmaitland
authored andcommitted
[RISCV] Update SiFive7's scheduling models with their optimizations on permutation instructions (llvm#160763)
In newer SiFive7 cores like X390, permutation instructions like vrgather.vv that operate on an LMUL smaller than a single DLEN can complete in a constant number of cycles. For slightly larger data that satisfies the constraint `log2(SEW/8) + log2(LMUL) <= log2(DLEN / 32)`, these instructions can also take a number of cycles proportional to the square of LMUL, rather than proportional to VL. Co-authored-by: Michael Maitland <[email protected]>
1 parent d70497e commit 4ec73eb

File tree

3 files changed

+542
-144
lines changed

3 files changed

+542
-144
lines changed

llvm/lib/Target/RISCV/RISCVSchedSiFive7.td

Lines changed: 94 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -169,6 +169,64 @@ class SiFive7GetOrderedReductionCycles<string mx, int sew, int VLEN> {
169169
int c = !mul(6, VLUpperBound);
170170
}
171171

172+
// Sets `c` to true when `mx` names one of the fractional LMULs "MF2", "MF4"
// or "MF8", and to false for every other value.
// NOTE(review): the name and the commit description suggest these are the
// LMULs whose data fits within a single DLEN -- confirm against the hardware
// documentation.
class isSingleDLEN<string mx> {
173+
bit c = !or(!eq(mx, "MF2"), !or(!eq(mx, "MF4"), !eq(mx, "MF8")));
174+
}
175+
176+
// Computes `c`, the cycle count for vrgather.vv at the given LMUL (`mx`),
// SEW (`sew`) and VLEN. `hasFastGather` selects between the optimized cycle
// counts and the one-cycle-per-element fallback.
class SiFive7GetCyclesVRGatherVV<string mx, int sew, int VLEN,
177+
bit hasFastGather> {
178+
// if (hasFastGather && isSingleDLEN(mx))
179+
//   c = 1;
180+
// else if (hasFastGather && (log2(SEW/8) + log2(LMUL) <= log2(DLEN / 32)))
181+
//   c = LMUL * 2 * ceil(vl * SEW / DLEN);
182+
// else
183+
//   c = vl;
184+
185+
// y = log2(SEW / 8).
defvar y = !logtwo(!div(sew, 8));
186+
// x = log2(SEW / 8) + log2(LMUL), the left-hand side of the constraint above.
defvar x = !cond(
187+
!eq(mx, "M1") : y,
188+
!eq(mx, "M2") : !add(y, 1),
189+
!eq(mx, "M4") : !add(y, 2),
190+
!eq(mx, "M8") : !add(y, 3),
191+
// Give isSingleDLEN(mx) cases a garbage value to avoid build failures,
192+
// even though x will go unused.
193+
true : 1
194+
);
195+
// LMUL * 2 * ceil(vl * SEW / DLEN) = LMUL * 2 * ceil(2 * LMUL)
// The table below holds that product for each integer LMUL, i.e.
// 4 * LMUL * LMUL.
// NOTE(review): the simplification to 2 * LMUL presumably takes vl at its
// maximum and DLEN = VLEN / 2 -- confirm.
196+
defvar z = !cond(
197+
!eq(mx, "M1") : 4,
198+
!eq(mx, "M2") : 16,
199+
!eq(mx, "M4") : 64,
200+
!eq(mx, "M8") : 256,
201+
// Give isSingleDLEN(mx) cases a garbage value to avoid build failures,
202+
// even though z will go unused.
203+
true : 1
204+
);
205+
// Fallback cycle count, taken from SiFive7GetCyclesOnePerElement.
defvar VLUpperBound = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
206+
bit IsSingleDLEN = isSingleDLEN<mx>.c;
207+
208+
// The first matching case wins. The second case compares x against
// log2(VLEN / 64), which stands in for log2(DLEN / 32) from the pseudo-code
// above (equal when DLEN = VLEN / 2).
int c = !cond(
209+
!and(hasFastGather, IsSingleDLEN) : 1,
210+
!and(hasFastGather, !le(x, !logtwo(!div(VLEN, 64)))) : z,
211+
true: VLUpperBound
212+
);
213+
}
214+
215+
// Computes `c`, the cycle count for vcompress at the given LMUL (`mx`),
// SEW (`sew`) and VLEN. The result is 1 when `hasFastGather` is set and `mx`
// satisfies isSingleDLEN; otherwise it is the value produced by
// SiFive7GetCyclesOnePerElement.
class SiFive7GetCyclesVCompress<string mx, int sew, int VLEN,
216+
bit hasFastGather> {
217+
218+
// if (hasFastGather && isSingleDLEN(mx))
219+
//   c = 1
220+
// else
221+
//   c = vl
222+
// Fallback cycle count, taken from SiFive7GetCyclesOnePerElement.
defvar VLUpperBound = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
223+
bit IsSingleDLEN = isSingleDLEN<mx>.c;
224+
225+
int c = !if(!and(hasFastGather, IsSingleDLEN),
226+
1,
227+
VLUpperBound);
228+
}
229+
172230
class SiFive7GetSiFiveVFNRClipCycles<string mx, int VLEN> {
173231
int latency = !cond(
174232
!eq(mx, "MF8"): 7,
@@ -259,7 +317,8 @@ multiclass SiFive7WriteResBase<int VLEN,
259317
ProcResourceKind VL, ProcResourceKind VS,
260318
ProcResourceKind VCQ,
261319
SiFive7FPLatencies fpLatencies,
262-
bit isFP64Throttled = false> {
320+
bit isFP64Throttled = false,
321+
bit hasFastGather = false> {
263322

264323
// Branching
265324
let Latency = 3 in {
@@ -976,13 +1035,33 @@ multiclass SiFive7WriteResBase<int VLEN,
9761035

9771036
foreach mx = SchedMxList in {
9781037
foreach sew = SchedSEWSet<mx>.val in {
979-
defvar Cycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
9801038
defvar IsWorstCase = SiFive7IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
981-
let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in {
982-
defm : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [VCQ, VA1], mx, sew, IsWorstCase>;
983-
defm : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [VCQ, VA1], mx, sew, IsWorstCase>;
984-
defm : LMULSEWWriteResMXSEW<"WriteVCompressV", [VCQ, VA1], mx, sew, IsWorstCase>;
985-
}
1039+
defvar IsSingleDLEN = isSingleDLEN<mx>.c;
1040+
1041+
defvar GatherVVCycles =
1042+
SiFive7GetCyclesVRGatherVV<mx, sew, VLEN, hasFastGather>.c;
1043+
// 7 + DLEN/ SEW
1044+
defvar SlowGatherLat = !add(7, !div(!div(VLEN, 2), sew));
1045+
defvar GatherVVLat = !if(hasFastGather,
1046+
!add(3, GatherVVCycles), SlowGatherLat);
1047+
1048+
let Latency = GatherVVLat, AcquireAtCycles = [0, 1],
1049+
ReleaseAtCycles = [1, !add(5, GatherVVCycles)] in
1050+
defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [VCQ, VA1], mx, sew, IsWorstCase>;
1051+
1052+
// VRGatherEI16VV is not improved by fastGather.
1053+
defvar GatherEI16VVCycles = SiFive7GetCyclesOnePerElement<mx, sew, VLEN>.c;
1054+
let Latency = SlowGatherLat, AcquireAtCycles = [0, 1],
1055+
ReleaseAtCycles = [1, !add(5, GatherEI16VVCycles)] in
1056+
defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [VCQ, VA1], mx, sew, IsWorstCase>;
1057+
1058+
defvar CompressCycles = SiFive7GetCyclesVCompress<mx, sew, VLEN, hasFastGather>.c;
1059+
defvar CompressLat = !if(!and(hasFastGather, IsSingleDLEN),
1060+
4,
1061+
!add(7, CompressCycles)); // 7 + VL
1062+
let Latency = CompressLat, AcquireAtCycles = [0, 1],
1063+
ReleaseAtCycles = [1, !add(8, CompressCycles)] in
1064+
defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [VCQ, VA1], mx, sew, IsWorstCase>;
9861065
}
9871066
}
9881067

@@ -1408,7 +1487,8 @@ multiclass SiFive7ReadAdvance {
14081487
/// eventually be supplied by different SchedMachineModels.
14091488
multiclass SiFive7SchedResources<int vlen, bit extraVALU,
14101489
SiFive7FPLatencies fpLatencies,
1411-
bit isFP64Throttled> {
1490+
bit isFP64Throttled,
1491+
bit hasFastGather> {
14121492
defm SiFive7 : SiFive7ProcResources<extraVALU>;
14131493

14141494
// Pull out defs from SiFive7ProcResources so we can refer to them by name.
@@ -1435,7 +1515,8 @@ multiclass SiFive7SchedResources<int vlen, bit extraVALU,
14351515
: SiFive7WriteResBase<vlen, SiFive7PipeA, SiFive7PipeB, SiFive7PipeAB,
14361516
SiFive7IDiv, SiFive7FDiv, SiFive7VA1,
14371517
SiFive7VA1OrVA2, SiFive7VL, SiFive7VS,
1438-
SiFive7VCQ, fpLatencies, isFP64Throttled>;
1518+
SiFive7VCQ, fpLatencies, isFP64Throttled,
1519+
hasFastGather>;
14391520

14401521
//===----------------------------------------------------------------------===//
14411522
// Bypass and advance
@@ -1468,6 +1549,7 @@ class SiFive7SchedMachineModel<int vlen> : SchedMachineModel {
14681549

14691550
SiFive7FPLatencies FPLatencies;
14701551
bit IsFP64Throttled = false;
1552+
bit HasFastGather = false;
14711553

14721554
string Name = !subst("Model", "", !subst("SiFive7", "", NAME));
14731555
}
@@ -1494,14 +1576,16 @@ def SiFive7VLEN1024X300Model : SiFive7SchedMachineModel<1024> {
14941576
let HasExtraVALU = true;
14951577
let FPLatencies = SiFive7LowFPLatencies;
14961578
let IsFP64Throttled = true;
1579+
let HasFastGather = true;
14971580
}
14981581

14991582
/// Binding models to their scheduling resources.
15001583
foreach model = [SiFive7VLEN512Model, SiFive7VLEN1024X300Model] in {
15011584
let SchedModel = model in
15021585
defm model.Name : SiFive7SchedResources<model.VLEN, model.HasExtraVALU,
15031586
model.FPLatencies,
1504-
model.IsFP64Throttled>;
1587+
model.IsFP64Throttled,
1588+
model.HasFastGather>;
15051589
}
15061590

15071591
// Some model name aliases.

0 commit comments

Comments
 (0)