Skip to content

Commit d615c14

Browse files
[RISCV] Update SpacemiT-X60 vector floating-point instructions latencies (#150618)
This PR adds hardware-measured latencies for all instructions defined in Section 13 of the RVV specification: "Vector Floating-Point Instructions" to the SpacemiT-X60 scheduling model.
1 parent d541680 commit d615c14

File tree

5 files changed

+2252
-2135
lines changed

5 files changed

+2252
-2135
lines changed

llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td

Lines changed: 156 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,11 @@ class Get461018Latency<string mx> {
104104
int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/6, /*M4=*/10, /*M8=*/18], mx>.c;
105105
}
106106

107+
// Used for: FP FMA operations, complex FP ops
108+
class Get6678Latency<string mx> {
109+
int c = GetLMULValue<[/*MF8=*/6, /*MF4=*/6, /*MF2=*/6, /*M1=*/6, /*M2=*/6, /*M4=*/7, /*M8=*/8], mx>.c;
110+
}
111+
107112
//===----------------------------------------------------------------------===//
108113

109114
class SMX60IsWorstCaseMX<string mx, list<string> MxList> {
@@ -574,83 +579,169 @@ foreach mx = SchedMxListF in {
574579
foreach sew = SchedSEWSet<mx, isF=1>.val in {
575580
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
576581

577-
defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
578-
defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
579-
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
580-
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
581-
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
582-
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
583-
}
584-
}
582+
defvar VFALULat = Get4458Latency<mx>.c;
583+
defvar VFALUOcc = ConstOneUntilM1ThenDouble<mx>.c;
584+
let Latency = VFALULat, ReleaseAtCycles = [VFALUOcc] in {
585+
defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
586+
defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
587+
defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>;
588+
defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>;
589+
}
585590

586-
foreach mx = SchedMxListF in {
587-
foreach sew = SchedSEWSet<mx, isF=1>.val in {
588-
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
591+
// Slightly increased latency for sew == 64
592+
defvar VFMulVLat = !if(!eq(sew, 64), ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c,
593+
Get4458Latency<mx>.c);
594+
let Latency = VFMulVLat, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
595+
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
596+
}
597+
// VFMulF has the same latency as VFMulV, but slightly lower ReleaseAtCycles (not modeled yet; the same value is used here)
598+
let Latency = VFMulVLat, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
599+
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
600+
}
589601

590-
defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>;
591-
defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>;
592-
defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>;
593-
defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>;
594-
defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>;
602+
defvar VFSgnjLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
603+
defvar VFSgnjOcc = ConstOneUntilMF2ThenDouble<mx>.c;
604+
let Latency = VFSgnjLat, ReleaseAtCycles = [VFSgnjOcc] in {
605+
defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>;
606+
defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>;
607+
defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>;
608+
defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
609+
}
595610

596-
defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
611+
// The following covers vfmacc, vfmsac, and their vfn* variants in the same group, but the
612+
// ReleaseAtCycles takes one extra cycle for the vfn* variants.
613+
// TODO: Should we split them?
614+
// TODO: for some reason, the following condition is not working and always uses the fallback ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c
615+
defvar VFMulAddLatency = !if(!eq(sew, 64),
616+
Get6678Latency<mx>.c,
617+
ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c
618+
);
619+
let Latency = VFMulAddLatency, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
620+
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
621+
defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
622+
}
597623
}
598624
}
599625

600626
foreach mx = SchedMxList in {
601627
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
602628

603-
defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>;
604-
defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>;
605-
defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>;
606-
defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>;
607-
defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>;
629+
// Slightly increased ReleaseAtCycles for M8: 18
630+
defvar VFCmpOcc = !if(!eq(mx, "M8"),
631+
!add(ConstOneUntilMF2ThenDouble<mx>.c, 2),
632+
ConstOneUntilMF2ThenDouble<mx>.c
633+
);
634+
let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [VFCmpOcc] in {
635+
defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>;
636+
defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>;
637+
}
608638

609-
defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
639+
defvar VFClassLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
640+
defvar VFClassOcc = ConstOneUntilMF2ThenDouble<mx>.c;
641+
let Latency = VFClassLat, ReleaseAtCycles = [VFClassOcc] in {
642+
defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>;
643+
defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>;
644+
defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>;
645+
defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
646+
}
610647
}
611648

612649
// Widening
613650
foreach mx = SchedMxListW in {
614651
foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
615652
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
616653

617-
defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
654+
defvar VFWCvtILat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
655+
defvar VFWCvtIOcc = ConstOneUntilMF4ThenDouble<mx>.c;
656+
let Latency = VFWCvtILat, ReleaseAtCycles = [VFWCvtIOcc] in {
657+
defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
658+
}
618659
}
619660
}
620661

621662
foreach mx = SchedMxListFW in {
622663
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListFW>.c;
623664

624-
defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
665+
defvar VFWCvtFToIVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
666+
defvar VFWCvtFToIVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
667+
let Latency = VFWCvtFToIVLat, ReleaseAtCycles = [VFWCvtFToIVOcc] in {
668+
defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
669+
}
625670
}
626671

627672
foreach mx = SchedMxListFW in {
628673
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
629674
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
630675

631-
defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
632-
defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
633-
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
634-
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
635-
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
636-
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
637-
defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
676+
defvar VFWCvtFToFVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
677+
defvar VFWCvtFToFVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
678+
let Latency = VFWCvtFToFVLat, ReleaseAtCycles = [VFWCvtFToFVOcc] in {
679+
defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
680+
}
681+
682+
// Latency for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 4/4/4/5/8
683+
// ReleaseAtCycles for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 1/1/2/4/8
684+
// Latency for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 5/5/5/9/17
685+
// ReleaseAtCycles for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 1/2/4/8/17
686+
// We use the worst-case
687+
defvar VFWALULat = !add(ConstValueUntilLMULThenDouble<"M1", 4, mx>.c, 1); // 5/5/9/17
688+
defvar VFWALUOcc = !if(!eq(mx, "M4"),
689+
!add(ConstOneUntilMF4ThenDouble<mx>.c, 1), // 2/4/8/17
690+
ConstOneUntilMF4ThenDouble<mx>.c
691+
);
692+
// TODO: Split .wf/.wv variants into separate scheduling classes
693+
let Latency = VFWALULat, ReleaseAtCycles = [VFWALUOcc] in {
694+
defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
695+
defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
696+
}
697+
698+
let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in {
699+
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
700+
}
701+
702+
// Slightly increased ReleaseAtCycles for SEW == 32
703+
defvar VFWMullOcc = !if(!eq(sew, 32),
704+
GetLMULValue<[1, 1, 1, 3, 5, 9, 18], mx>.c,
705+
ConstOneUntilMF2ThenDouble<mx>.c
706+
);
707+
defvar VFWMulVLat = ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c;
708+
let Latency = VFWMulVLat, ReleaseAtCycles = [VFWMullOcc] in {
709+
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
710+
}
711+
712+
// Latency for vfwmacc, vfwnmacc, etc: e16 = 5/5/5/8; e32 = 6/6/7/8
713+
defvar VFWMulAddVLat = !if(!eq(sew, 16),
714+
ConstValueUntilLMULThenDoubleBase<"M4", 5, 8, mx>.c,
715+
Get6678Latency<mx>.c
716+
);
717+
let Latency = VFWMulAddVLat, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in {
718+
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
719+
defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
720+
}
638721
}
639722
}
640723

641724
// Narrowing
642725
foreach mx = SchedMxListW in {
643726
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
644727

645-
defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
728+
defvar VFNCvtFToIVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
729+
defvar VFNCvtFToIVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
730+
let Latency = VFNCvtFToIVLat, ReleaseAtCycles = [VFNCvtFToIVOcc] in {
731+
defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
732+
}
646733
}
647734

648735
foreach mx = SchedMxListFW in {
649736
foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
650-
651737
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
652-
defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
653-
defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
738+
739+
defvar VFNCvtToFVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
740+
defvar VFNCvtToFVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
741+
let Latency = VFNCvtToFVLat, ReleaseAtCycles = [VFNCvtToFVOcc] in {
742+
defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
743+
defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
744+
}
654745
}
655746
}
656747

@@ -659,9 +750,35 @@ foreach mx = SchedMxListF in {
659750
foreach sew = SchedSEWSet<mx, 1>.val in {
660751
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
661752

662-
defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>;
663-
defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>;
664-
defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>;
753+
// Compute Latency and ReleaseAtCycles based on SEW
754+
// Latency for vfdiv.vf: e16/e32 = 12/24/48/96; e64 = 18/36/72/144
755+
// Latency for vfrdiv.vf: e16/e32 = 12/24/48/96; e64 = 40/80/160/320
756+
// We use the worst-case, vfdiv.vf is penalized in e64
757+
// TODO: split vfdiv.vf and vfrdiv.vf into separate scheduling classes
758+
defvar VFDivFFactor = !if(!eq(sew, 64), 40, 12);
759+
defvar VFDivFLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFDivFFactor);
760+
let Latency = VFDivFLatAndOcc, ReleaseAtCycles = [VFDivFLatAndOcc] in {
761+
defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>;
762+
}
763+
764+
defvar VFDivVFactor = !if(!eq(sew, 16), 12, 40);
765+
defvar VFDivVLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFDivVFactor);
766+
let Latency = VFDivVLatAndOcc, ReleaseAtCycles = [VFDivVLatAndOcc] in {
767+
defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>;
768+
}
769+
}
770+
}
771+
772+
// Pattern for vfsqrt.v: e16 = 18/36/72/144; e32 = 38/76/152/304; e64 = 40/80/160/320 (NOTE(review): the factor used below for e16 is 12, not 18 -- confirm)
773+
foreach mx = SchedMxListF in {
774+
foreach sew = SchedSEWSet<mx, 1>.val in {
775+
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
776+
777+
defvar VFSqrtVFactor = !if(!eq(sew, 16), 12, 40);
778+
defvar VFSqrtVLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFSqrtVFactor);
779+
let Latency = VFSqrtVLatAndOcc, ReleaseAtCycles = [VFSqrtVLatAndOcc] in {
780+
defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>;
781+
}
665782
}
666783
}
667784

0 commit comments

Comments
 (0)