Skip to content

Commit b206ab3

Browse files
Update ReleaseAtCycles from camel cdr data
Signed-off-by: Mikhail R. Gadelha <[email protected]>
1 parent 976ba6a commit b206ab3

File tree

3 files changed

+256
-215
lines changed

3 files changed

+256
-215
lines changed

llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td

Lines changed: 71 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,11 @@ class Get461018Latency<string mx> {
104104
int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/6, /*M4=*/10, /*M8=*/18], mx>.c;
105105
}
106106

107+
// Used for: FP FMA operations, complex FP ops
108+
class Get6678Latency<string mx> {
109+
int c = GetLMULValue<[/*MF8=*/6, /*MF4=*/6, /*MF2=*/6, /*M1=*/6, /*M2=*/6, /*M4=*/7, /*M8=*/8], mx>.c;
110+
}
111+
107112
//===----------------------------------------------------------------------===//
108113

109114
class SMX60IsWorstCaseMX<string mx, list<string> MxList> {
@@ -541,21 +546,29 @@ foreach mx = SchedMxListF in {
541546
foreach sew = SchedSEWSet<mx, isF=1>.val in {
542547
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
543548

544-
let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [GetOneM1AndDouble<mx>.c] in {
549+
defvar VFALULat = Get4458Latency<mx>.c;
550+
defvar VFALUOcc = ConstOneUntilM1ThenDouble<mx>.c;
551+
let Latency = VFALULat, ReleaseAtCycles = [VFALUOcc] in {
545552
defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
546553
defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
547554
defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>;
548555
defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>;
549556
}
550557

551558
// Slightly increased latency for sew == 64
552-
defvar VFMulLatency = !if(!eq(sew, 64), Get5558Latency<mx>.c, Get458Latency<mx>.c);
553-
let Latency = VFMulLatency, ReleaseAtCycles = [GetOneM1AndDouble<mx>.c] in {
559+
defvar VFMulVLatAndOcc = !if(!eq(sew, 64), ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c,
560+
Get4458Latency<mx>.c);
561+
let Latency = VFMulVLatAndOcc, ReleaseAtCycles = [VFMulVLatAndOcc] in {
554562
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
563+
}
564+
// VFMulF has the same latency as VFMulV, but slightly lower ReleaseAtCycles
565+
let Latency = VFMulVLatAndOcc, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
555566
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
556567
}
557568

558-
let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [GetOneMF2AndDouble<mx>.c] in {
569+
defvar VFSgnjLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
570+
defvar VFSgnjOcc = ConstOneUntilMF2ThenDouble<mx>.c;
571+
let Latency = VFSgnjLat, ReleaseAtCycles = [VFSgnjOcc] in {
559572
defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>;
560573
defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>;
561574
defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>;
@@ -565,9 +578,12 @@ foreach mx = SchedMxListF in {
565578
// The following covers vfmacc, vfmsac, and their vfn* variants in the same group, but the
566579
// ReleaseAtCycles takes one extra cycle for the vfn* variants.
567580
// TODO: Should we split them?
568-
// TODO: for some reason, the following cond is not working, and always use Get5558Latency
569-
defvar VFMulAddLatency = !if(!eq(sew, 64), Get6678Latency<mx>.c, Get5558Latency<mx>.c);
570-
let Latency = VFMulAddLatency, ReleaseAtCycles = [GetOneM1AndDouble<mx>.c] in {
581+
// TODO: for some reason, the following cond is not working, and always uses ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c
582+
defvar VFMulAddLatency = !if(!eq(sew, 64),
583+
Get6678Latency<mx>.c,
584+
ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c
585+
);
586+
let Latency = VFMulAddLatency, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
571587
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
572588
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
573589
}
@@ -578,13 +594,18 @@ foreach mx = SchedMxList in {
578594
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
579595

580596
// Slightly increased ReleaseAtCycles for M8: 18
581-
defvar VFCmpOcc = !if(!eq(mx, "M8"), !add(GetOneMF2AndDouble<mx>.c, 2), GetOneMF2AndDouble<mx>.c);
597+
defvar VFCmpOcc = !if(!eq(mx, "M8"),
598+
!add(ConstOneUntilMF2ThenDouble<mx>.c, 2),
599+
ConstOneUntilMF2ThenDouble<mx>.c
600+
);
582601
let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [VFCmpOcc] in {
583602
defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>;
584603
defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>;
585604
}
586605

587-
let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [GetOneMF2AndDouble<mx>.c] in {
606+
defvar VFClassLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
607+
defvar VFClassOcc = ConstOneUntilMF2ThenDouble<mx>.c;
608+
let Latency = VFClassLat, ReleaseAtCycles = [VFClassOcc] in {
588609
defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>;
589610
defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>;
590611
defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>;
@@ -597,7 +618,9 @@ foreach mx = SchedMxListW in {
597618
foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
598619
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
599620

600-
let Latency = !mul(GetOneM1AndDouble<mx>.c, 4), ReleaseAtCycles = [GetOneMF4AndDouble<mx>.c] in {
621+
defvar VFWCvtILat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
622+
defvar VFWCvtIOcc = ConstOneUntilMF4ThenDouble<mx>.c;
623+
let Latency = VFWCvtILat, ReleaseAtCycles = [VFWCvtIOcc] in {
601624
defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
602625
}
603626
}
@@ -606,7 +629,9 @@ foreach mx = SchedMxListW in {
606629
foreach mx = SchedMxListFW in {
607630
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListFW>.c;
608631

609-
let Latency = !mul(GetOneM1AndDouble<mx>.c, 4), ReleaseAtCycles = [GetOneMF4AndDouble<mx>.c] in {
632+
defvar VFWCvtFToIVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
633+
defvar VFWCvtFToIVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
634+
let Latency = VFWCvtFToIVLat, ReleaseAtCycles = [VFWCvtFToIVOcc] in {
610635
defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
611636
}
612637
}
@@ -615,36 +640,48 @@ foreach mx = SchedMxListFW in {
615640
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
616641
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
617642

618-
let Latency = !mul(GetOneM1AndDouble<mx>.c, 4), ReleaseAtCycles = [GetOneMF4AndDouble<mx>.c] in {
643+
defvar VFWCvtFToFVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
644+
defvar VFWCvtFToFVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
645+
let Latency = VFWCvtFToFVLat, ReleaseAtCycles = [VFWCvtFToFVOcc] in {
619646
defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
620647
}
621648

622-
let Latency = !mul(GetOneM1AndDouble<mx>.c, 4), ReleaseAtCycles = [GetOneMF4AndDouble<mx>.c] in {
623-
// Latency for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 4/4/4/5/8
624-
// Latency for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 5/5/5/9/17
625-
// ReleaseAtCycles for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 1/1/2/4/8
626-
// ReleaseAtCycles for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 1/2/4/8/17
627-
// TODO: Split .wf/.wv variants into separate scheduling classes
649+
// Latency for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 4/4/4/5/8
650+
// ReleaseAtCycles for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 1/1/2/4/8
651+
// Latency for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 5/5/5/9/17
652+
// ReleaseAtCycles for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 1/2/4/8/17
653+
// We use the worst-case
654+
defvar VFWALULat = !add(ConstValueUntilLMULThenDouble<"M1", 4, mx>.c, 1); // 5/5/9/17
655+
defvar VFWALUOcc = !if(!eq(mx, "M4"),
656+
!add(ConstOneUntilMF4ThenDouble<mx>.c, 1), // 2/4/8/17
657+
ConstOneUntilMF4ThenDouble<mx>.c
658+
);
659+
// TODO: Split .wf/.wv variants into separate scheduling classes
660+
let Latency = VFWALULat, ReleaseAtCycles = [VFWALUOcc] in {
628661
defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
629662
defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
630663
}
631664

632-
let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [GetOneMF2AndDouble<mx>.c] in {
665+
let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in {
633666
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
634667
}
635668

636669
// Slightly increased ReleaseAtCycles for SEW == 32
637670
defvar VFWMullOcc = !if(!eq(sew, 32),
638-
!add(GetDoublingValueFromLMUL<"M2", 2, mx>.c, 1), // 1/3/5/9
639-
GetOneMF2AndDouble<mx>.c
671+
!add(ConstValueUntilLMULThenDouble<"M2", 2, mx>.c, 1), // 1/3/5/9
672+
ConstOneUntilMF2ThenDouble<mx>.c
640673
);
641-
let Latency = Get5558Latency<mx>.c, ReleaseAtCycles = [VFWMullOcc] in {
674+
defvar VFWMulVLat = ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c;
675+
let Latency = VFWMulVLat, ReleaseAtCycles = [VFWMullOcc] in {
642676
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
643677
}
644678

645-
// Pattern for vfwmacc, vfwnmacc, etc: e16 = 5/5/5/8; e32 = 6/6/7/8
646-
// Use existing 6,6,7,8 as close approximation
647-
let Latency = Get6678Latency<mx>.c, ReleaseAtCycles = [GetOneMF2AndDouble<mx>.c] in {
679+
// Latency for vfwmacc, vfwnmacc, etc: e16 = 5/5/5/8; e32 = 6/6/7/8
680+
defvar VFWMulAddVLat = !if(!eq(sew, 16),
681+
ConstValueUntilLMULThenDoubleBase<"M4", 5, 8, mx>.c,
682+
Get6678Latency<mx>.c
683+
);
684+
let Latency = VFWMulAddVLat, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in {
648685
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
649686
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
650687
}
@@ -655,7 +692,9 @@ foreach mx = SchedMxListFW in {
655692
foreach mx = SchedMxListW in {
656693
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
657694

658-
let Latency = !mul(GetOneM1AndDouble<mx>.c, 4), ReleaseAtCycles = [GetOneMF4AndDouble<mx>.c] in {
695+
defvar VFNCvtFToIVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
696+
defvar VFNCvtFToIVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
697+
let Latency = VFNCvtFToIVLat, ReleaseAtCycles = [VFNCvtFToIVOcc] in {
659698
defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
660699
}
661700
}
@@ -664,7 +703,9 @@ foreach mx = SchedMxListFW in {
664703
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
665704
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
666705

667-
let Latency = !mul(GetOneM1AndDouble<mx>.c, 4), ReleaseAtCycles = [GetOneMF4AndDouble<mx>.c] in {
706+
defvar VFNCvtToFVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
707+
defvar VFNCvtToFVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
708+
let Latency = VFNCvtToFVLat, ReleaseAtCycles = [VFNCvtToFVOcc] in {
668709
defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
669710
defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
670711
}
@@ -682,13 +723,13 @@ foreach mx = SchedMxListF in {
682723
// We use the worst-case, vfdiv.vf is penalized in e64
683724
// TODO: split vfdiv.vf and vfrdiv.vf into separate scheduling classes
684725
defvar VFDivFFactor = !if(!eq(sew, 64), 40, 12);
685-
defvar VFDivFLatAndOcc = !mul(GetOneM1AndDouble<mx>.c, VFDivFFactor);
726+
defvar VFDivFLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFDivFFactor);
686727
let Latency = VFDivFLatAndOcc, ReleaseAtCycles = [VFDivFLatAndOcc] in {
687728
defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>;
688729
}
689730

690731
defvar VFDivVFactor = !if(!eq(sew, 16), 12, 40);
691-
defvar VFDivVLatAndOcc = !mul(GetOneM1AndDouble<mx>.c, VFDivVFactor);
732+
defvar VFDivVLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFDivVFactor);
692733
let Latency = VFDivVLatAndOcc, ReleaseAtCycles = [VFDivVLatAndOcc] in {
693734
defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>;
694735
}
@@ -701,7 +742,7 @@ foreach mx = SchedMxListF in {
701742
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
702743

703744
defvar VFSqrtVFactor = !if(!eq(sew, 16), 12, 40);
704-
defvar VFSqrtVLatAndOcc = !mul(GetOneM1AndDouble<mx>.c, VFSqrtVFactor);
745+
defvar VFSqrtVLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFSqrtVFactor);
705746
let Latency = VFSqrtVLatAndOcc, ReleaseAtCycles = [VFSqrtVLatAndOcc] in {
706747
defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>;
707748
}

0 commit comments

Comments
 (0)