Skip to content
195 changes: 156 additions & 39 deletions llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,11 @@ class Get461018Latency<string mx> {
int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/6, /*M4=*/10, /*M8=*/18], mx>.c;
}

// Used for: FP FMA operations, complex FP ops
class Get6678Latency<string mx> {
int c = GetLMULValue<[/*MF8=*/6, /*MF4=*/6, /*MF2=*/6, /*M1=*/6, /*M2=*/6, /*M4=*/7, /*M8=*/8], mx>.c;
}

//===----------------------------------------------------------------------===//

class SMX60IsWorstCaseMX<string mx, list<string> MxList> {
Expand Down Expand Up @@ -574,83 +579,169 @@ foreach mx = SchedMxListF in {
foreach sew = SchedSEWSet<mx, isF=1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;

defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
}
}
defvar VFALULat = Get4458Latency<mx>.c;
defvar VFALUOcc = ConstOneUntilM1ThenDouble<mx>.c;
let Latency = VFALULat, ReleaseAtCycles = [VFALUOcc] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>;
}

foreach mx = SchedMxListF in {
foreach sew = SchedSEWSet<mx, isF=1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
// Slightly increased latency for sew == 64
defvar VFMulVLat = !if(!eq(sew, 64), ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c,
Get4458Latency<mx>.c);
let Latency = VFMulVLat, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
}
// VFMulF has the same latency as VFMulV, but slighlty lower ReleaseAtCycles
let Latency = VFMulVLat, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
}

defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>;
defvar VFSgnjLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
defvar VFSgnjOcc = ConstOneUntilMF2ThenDouble<mx>.c;
let Latency = VFSgnjLat, ReleaseAtCycles = [VFSgnjOcc] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
}

defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
// The following covers vfmacc, vfmsac, and their vfn* variants in the same group, but the
// ReleaseAtCycles takes one extra cycle for the vfn* variants.
// TODO: Should we split them?
// TODO: for some reason, the following cond is not working, and always use ConstValueUntilLMULThenDoubleBase<"M4", 5, 8, mx>.c
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is this TODO still valid?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Unfortunately, yes. You can see in rvv-fp.s. For sew == 64, it should be picking 6678, but it stays in 5558 like the other condition.

I've tried to debug this some months ago, but got nowhere. The diff is small tho.

defvar VFMulAddLatency = !if(!eq(sew, 64),
Get6678Latency<mx>.c,
ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c
);
let Latency = VFMulAddLatency, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
}
}
}

foreach mx = SchedMxList in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;

defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>;
// Slightly increased ReleaseAtCycles for M8: 18
defvar VFCmpOcc = !if(!eq(mx, "M8"),
!add(ConstOneUntilMF2ThenDouble<mx>.c, 2),
ConstOneUntilMF2ThenDouble<mx>.c
);
let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [VFCmpOcc] in {
defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>;
}

defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
defvar VFClassLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
defvar VFClassOcc = ConstOneUntilMF2ThenDouble<mx>.c;
let Latency = VFClassLat, ReleaseAtCycles = [VFClassOcc] in {
defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>;
defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
}
}

// Widening
foreach mx = SchedMxListW in {
foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;

defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
defvar VFWCvtILat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
defvar VFWCvtIOcc = ConstOneUntilMF4ThenDouble<mx>.c;
let Latency = VFWCvtILat, ReleaseAtCycles = [VFWCvtIOcc] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
}
}
}

foreach mx = SchedMxListFW in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListFW>.c;

defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
defvar VFWCvtFToIVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
defvar VFWCvtFToIVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
let Latency = VFWCvtFToIVLat, ReleaseAtCycles = [VFWCvtFToIVOcc] in {
defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
}
}

foreach mx = SchedMxListFW in {
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;

defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
defvar VFWCvtFToFVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
defvar VFWCvtFToFVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
let Latency = VFWCvtFToFVLat, ReleaseAtCycles = [VFWCvtFToFVOcc] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
}

// Latency for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 4/4/4/5/8
// ReleaseAtCycles for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 1/1/2/4/8
// Latency for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 5/5/5/9/17
// ReleaseAtCycles for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 1/2/4/8/17
// We use the worst-case
defvar VFWALULat = !add(ConstValueUntilLMULThenDouble<"M1", 4, mx>.c, 1); // 5/5/9/17
defvar VFWALUOcc = !if(!eq(mx, "M4"),
!add(ConstOneUntilMF4ThenDouble<mx>.c, 1), // 2/4/8/17
ConstOneUntilMF4ThenDouble<mx>.c
);
// TODO: Split .wf/.wv variants into separate scheduling classes
let Latency = VFWALULat, ReleaseAtCycles = [VFWALUOcc] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
}

let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
}

// Slightly increased latency for SEW == 32
defvar VFWMullOcc = !if(!eq(sew, 32),
GetLMULValue<[1, 1, 1, 3, 5, 9, 18], mx>.c,
ConstOneUntilMF2ThenDouble<mx>.c
);
defvar VFWMulVLat = ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c;
let Latency = VFWMulVLat, ReleaseAtCycles = [VFWMullOcc] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
}

// Latency for vfwmacc, vfwnmacc, etc: e16 = 5/5/5/8; e32 = 6/6/7/8
defvar VFWMulAddVLat = !if(!eq(sew, 16),
ConstValueUntilLMULThenDoubleBase<"M4", 5, 8, mx>.c,
Get6678Latency<mx>.c
);
let Latency = VFWMulAddVLat, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
}
}
}

// Narrowing
foreach mx = SchedMxListW in {
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;

defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
defvar VFNCvtFToIVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
defvar VFNCvtFToIVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
let Latency = VFNCvtFToIVLat, ReleaseAtCycles = [VFNCvtFToIVOcc] in {
defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
}
}

foreach mx = SchedMxListFW in {
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {

defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;

defvar VFNCvtToFVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
defvar VFNCvtToFVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
let Latency = VFNCvtToFVLat, ReleaseAtCycles = [VFNCvtToFVOcc] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
}
}
}

Expand All @@ -659,9 +750,35 @@ foreach mx = SchedMxListF in {
foreach sew = SchedSEWSet<mx, 1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;

defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>;
defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>;
// Compute ReleaseAtCycles based on SEW
// Latency for vfdiv.vf: e16/e32 = 12/24/48/96; e64 = 18/36/72/144
// Latency for vfrdiv.vf: e16/e32 = 12/24/48/96; e64 = 40/80/160/320
// We use the worst-case, vfdiv.vf is penalized in e64
// TODO: split vfdiv.vf and vfrdiv.vf into separate scheduling classes
defvar VFDivFFactor = !if(!eq(sew, 64), 40, 12);
defvar VFDivFLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFDivFFactor);
let Latency = VFDivFLatAndOcc, ReleaseAtCycles = [VFDivFLatAndOcc] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>;
}

defvar VFDivVFactor = !if(!eq(sew, 16), 12, 40);
defvar VFDivVLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFDivVFactor);
let Latency = VFDivVLatAndOcc, ReleaseAtCycles = [VFDivVLatAndOcc] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>;
}
}
}

// Pattern for vfsqrt.v: e16 = 18/36/72/144; e32 = 38/76/152/304; e64 = 40/80/160/320
foreach mx = SchedMxListF in {
foreach sew = SchedSEWSet<mx, 1>.val in {
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;

defvar VFSqrtVFactor = !if(!eq(sew, 16), 12, 40);
defvar VFSqrtVLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFSqrtVFactor);
let Latency = VFSqrtVLatAndOcc, ReleaseAtCycles = [VFSqrtVLatAndOcc] in {
defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>;
}
}
}

Expand Down
Loading