@@ -104,6 +104,11 @@ class Get461018Latency<string mx> {
104104 int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/6, /*M4=*/10, /*M8=*/18], mx>.c;
105105}
106106
107+ // Used for: FP FMA operations, complex FP ops
108+ class Get6678Latency<string mx> {
109+ int c = GetLMULValue<[/*MF8=*/6, /*MF4=*/6, /*MF2=*/6, /*M1=*/6, /*M2=*/6, /*M4=*/7, /*M8=*/8], mx>.c;
110+ }
111+
107112//===----------------------------------------------------------------------===//
108113
109114class SMX60IsWorstCaseMX<string mx, list<string> MxList> {
@@ -574,83 +579,169 @@ foreach mx = SchedMxListF in {
574579 foreach sew = SchedSEWSet<mx, isF=1>.val in {
575580 defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
576581
577- defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase> ;
578- defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase> ;
579- defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
580- defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF ", [SMX60_VFP], mx, sew, IsWorstCase>;
581- defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV ", [SMX60_VFP], mx, sew, IsWorstCase>;
582- defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF ", [SMX60_VFP], mx, sew, IsWorstCase>;
583- }
584- }
582+ defvar VFALULat = Get4458Latency<mx>.c ;
583+ defvar VFALUOcc = ConstOneUntilM1ThenDouble<mx>.c ;
584+ let Latency = VFALULat, ReleaseAtCycles = [VFALUOcc] in {
585+ defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV ", [SMX60_VFP], mx, sew, IsWorstCase>;
586+ defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF ", [SMX60_VFP], mx, sew, IsWorstCase>;
587+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV ", [SMX60_VFP], mx, sew, IsWorstCase>;
588+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>;
589+ }
585590
586- foreach mx = SchedMxListF in {
587- foreach sew = SchedSEWSet<mx, isF=1>.val in {
588- defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
591+ // Slightly increased latency for sew == 64
592+ defvar VFMulVLat = !if(!eq(sew, 64), ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c,
593+ Get4458Latency<mx>.c);
594+ let Latency = VFMulVLat, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
595+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
596+ }
597+ // VFMulF has the same latency as VFMulV, but slightly lower ReleaseAtCycles (NOTE(review): the value below is identical to VFMulV's; confirm)
598+ let Latency = VFMulVLat, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
599+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
600+ }
589601
590- defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>;
591- defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>;
592- defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>;
593- defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>;
594- defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>;
602+ defvar VFSgnjLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
603+ defvar VFSgnjOcc = ConstOneUntilMF2ThenDouble<mx>.c;
604+ let Latency = VFSgnjLat, ReleaseAtCycles = [VFSgnjOcc] in {
605+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>;
606+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>;
607+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>;
608+ defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
609+ }
595610
596- defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
611+ // The following covers vfmacc, vfmsac, and their vfn* variants in the same group, but the
612+ // ReleaseAtCycles takes one extra cycle for the vfn* variants.
613+ // TODO: Should we split them?
614+ // TODO: for some reason, the following condition is not working, and it always uses ConstValueUntilLMULThenDoubleBase<"M4", 5, 8, mx>.c
615+ defvar VFMulAddLatency = !if(!eq(sew, 64),
616+ Get6678Latency<mx>.c,
617+ ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c
618+ );
619+ let Latency = VFMulAddLatency, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
620+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
621+ defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
622+ }
597623 }
598624}
599625
600626foreach mx = SchedMxList in {
601627 defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
602628
603- defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>;
604- defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>;
605- defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>;
606- defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>;
607- defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>;
629+ // Slightly increased ReleaseAtCycles for M8: 18
630+ defvar VFCmpOcc = !if(!eq(mx, "M8"),
631+ !add(ConstOneUntilMF2ThenDouble<mx>.c, 2),
632+ ConstOneUntilMF2ThenDouble<mx>.c
633+ );
634+ let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [VFCmpOcc] in {
635+ defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>;
636+ defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>;
637+ }
608638
609- defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
639+ defvar VFClassLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
640+ defvar VFClassOcc = ConstOneUntilMF2ThenDouble<mx>.c;
641+ let Latency = VFClassLat, ReleaseAtCycles = [VFClassOcc] in {
642+ defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>;
643+ defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>;
644+ defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>;
645+ defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
646+ }
610647}
611648
612649// Widening
613650foreach mx = SchedMxListW in {
614651 foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
615652 defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
616653
617- defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
654+ defvar VFWCvtILat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
655+ defvar VFWCvtIOcc = ConstOneUntilMF4ThenDouble<mx>.c;
656+ let Latency = VFWCvtILat, ReleaseAtCycles = [VFWCvtIOcc] in {
657+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
658+ }
618659 }
619660}
620661
621662foreach mx = SchedMxListFW in {
622663 defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListFW>.c;
623664
624- defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
665+ defvar VFWCvtFToIVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
666+ defvar VFWCvtFToIVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
667+ let Latency = VFWCvtFToIVLat, ReleaseAtCycles = [VFWCvtFToIVOcc] in {
668+ defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
669+ }
625670}
626671
627672foreach mx = SchedMxListFW in {
628673 foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
629674 defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
630675
631- defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
632- defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
633- defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
634- defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
635- defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
636- defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
637- defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
676+ defvar VFWCvtFToFVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
677+ defvar VFWCvtFToFVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
678+ let Latency = VFWCvtFToFVLat, ReleaseAtCycles = [VFWCvtFToFVOcc] in {
679+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
680+ }
681+
682+ // Latency for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 4/4/4/5/8
683+ // ReleaseAtCycles for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 1/1/2/4/8
684+ // Latency for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 5/5/5/9/17
685+ // ReleaseAtCycles for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 1/2/4/8/17
686+ // We use the worst-case
687+ defvar VFWALULat = !add(ConstValueUntilLMULThenDouble<"M1", 4, mx>.c, 1); // 5/5/9/17
688+ defvar VFWALUOcc = !if(!eq(mx, "M4"),
689+ !add(ConstOneUntilMF4ThenDouble<mx>.c, 1), // 2/4/8/17
690+ ConstOneUntilMF4ThenDouble<mx>.c
691+ );
692+ // TODO: Split .wf/.wv variants into separate scheduling classes
693+ let Latency = VFWALULat, ReleaseAtCycles = [VFWALUOcc] in {
694+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
695+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
696+ }
697+
698+ let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in {
699+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
700+ }
701+
702+ // Slightly increased ReleaseAtCycles for SEW == 32
703+ defvar VFWMullOcc = !if(!eq(sew, 32),
704+ GetLMULValue<[1, 1, 1, 3, 5, 9, 18], mx>.c,
705+ ConstOneUntilMF2ThenDouble<mx>.c
706+ );
707+ defvar VFWMulVLat = ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c;
708+ let Latency = VFWMulVLat, ReleaseAtCycles = [VFWMullOcc] in {
709+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
710+ }
711+
712+ // Latency for vfwmacc, vfwnmacc, etc: e16 = 5/5/5/8; e32 = 6/6/7/8
713+ defvar VFWMulAddVLat = !if(!eq(sew, 16),
714+ ConstValueUntilLMULThenDoubleBase<"M4", 5, 8, mx>.c,
715+ Get6678Latency<mx>.c
716+ );
717+ let Latency = VFWMulAddVLat, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in {
718+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
719+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
720+ }
638721 }
639722}
640723
641724// Narrowing
642725foreach mx = SchedMxListW in {
643726 defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
644727
645- defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
728+ defvar VFNCvtFToIVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
729+ defvar VFNCvtFToIVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
730+ let Latency = VFNCvtFToIVLat, ReleaseAtCycles = [VFNCvtFToIVOcc] in {
731+ defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
732+ }
646733}
647734
648735foreach mx = SchedMxListFW in {
649736 foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
650-
651737 defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
652- defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
653- defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
738+
739+ defvar VFNCvtToFVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
740+ defvar VFNCvtToFVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
741+ let Latency = VFNCvtToFVLat, ReleaseAtCycles = [VFNCvtToFVOcc] in {
742+ defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
743+ defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
744+ }
654745 }
655746}
656747
@@ -659,9 +750,35 @@ foreach mx = SchedMxListF in {
659750 foreach sew = SchedSEWSet<mx, 1>.val in {
660751 defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
661752
662- defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>;
663- defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>;
664- defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>;
753+ // Compute ReleaseAtCycles based on SEW
754+ // Latency for vfdiv.vf: e16/e32 = 12/24/48/96; e64 = 18/36/72/144
755+ // Latency for vfrdiv.vf: e16/e32 = 12/24/48/96; e64 = 40/80/160/320
756+ // We use the worst-case, vfdiv.vf is penalized in e64
757+ // TODO: split vfdiv.vf and vfrdiv.vf into separate scheduling classes
758+ defvar VFDivFFactor = !if(!eq(sew, 64), 40, 12);
759+ defvar VFDivFLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFDivFFactor);
760+ let Latency = VFDivFLatAndOcc, ReleaseAtCycles = [VFDivFLatAndOcc] in {
761+ defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>;
762+ }
763+
764+ defvar VFDivVFactor = !if(!eq(sew, 16), 12, 40);
765+ defvar VFDivVLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFDivVFactor);
766+ let Latency = VFDivVLatAndOcc, ReleaseAtCycles = [VFDivVLatAndOcc] in {
767+ defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>;
768+ }
769+ }
770+ }
771+
772+ // Pattern for vfsqrt.v: e16 = 18/36/72/144; e32 = 38/76/152/304; e64 = 40/80/160/320 (NOTE(review): the e16 factor below is 12, not 18; confirm)
773+ foreach mx = SchedMxListF in {
774+ foreach sew = SchedSEWSet<mx, 1>.val in {
775+ defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
776+
777+ defvar VFSqrtVFactor = !if(!eq(sew, 16), 12, 40);
778+ defvar VFSqrtVLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFSqrtVFactor);
779+ let Latency = VFSqrtVLatAndOcc, ReleaseAtCycles = [VFSqrtVLatAndOcc] in {
780+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>;
781+ }
665782 }
666783}
667784
0 commit comments