@@ -104,6 +104,11 @@ class Get461018Latency<string mx> {
104104 int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/6, /*M4=*/10, /*M8=*/18], mx>.c;
105105}
106106
107+ // Per-LMUL latency table resolved through GetLMULValue for `mx`: 6 for MF8 through M2, 7 for M4, 8 for M8. Used for: FP FMA operations, complex FP ops
108+ class Get6678Latency<string mx> {
109+ int c = GetLMULValue<[/*MF8=*/6, /*MF4=*/6, /*MF2=*/6, /*M1=*/6, /*M2=*/6, /*M4=*/7, /*M8=*/8], mx>.c;
110+ }
111+
107112//===----------------------------------------------------------------------===//
108113
109114class SMX60IsWorstCaseMX<string mx, list<string> MxList> {
@@ -541,21 +546,29 @@ foreach mx = SchedMxListF in {
541546 foreach sew = SchedSEWSet<mx, isF=1>.val in {
542547 defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
543548
544- let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [GetOneM1AndDouble<mx>.c] in {
549+ defvar VFALULat = Get4458Latency<mx>.c;
550+ defvar VFALUOcc = ConstOneUntilM1ThenDouble<mx>.c;
551+ let Latency = VFALULat, ReleaseAtCycles = [VFALUOcc] in {
545552 defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
546553 defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
547554 defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>;
548555 defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>;
549556 }
550557
551558 // Slightly increased latency for sew == 64
552- defvar VFMulLatency = !if(!eq(sew, 64), Get5558Latency<mx>.c, Get458Latency<mx>.c);
553- let Latency = VFMulLatency, ReleaseAtCycles = [GetOneM1AndDouble<mx>.c] in {
559+ defvar VFMulVLatAndOcc = !if(!eq(sew, 64), ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c,
560+ Get4458Latency<mx>.c);
561+ let Latency = VFMulVLatAndOcc, ReleaseAtCycles = [VFMulVLatAndOcc] in {
554562 defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
563+ }
564+ // VFMulF has the same latency as VFMulV, but slightly lower ReleaseAtCycles
565+ let Latency = VFMulVLatAndOcc, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
555566 defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
556567 }
557568
558- let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [GetOneMF2AndDouble<mx>.c] in {
569+ defvar VFSgnjLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
570+ defvar VFSgnjOcc = ConstOneUntilMF2ThenDouble<mx>.c;
571+ let Latency = VFSgnjLat, ReleaseAtCycles = [VFSgnjOcc] in {
559572 defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>;
560573 defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>;
561574 defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>;
@@ -565,9 +578,12 @@ foreach mx = SchedMxListF in {
565578 // The following covers vfmacc, vfmsac, and their vfn* variants in the same group, but the
566579 // ReleaseAtCycles takes one extra cycle for the vfn* variants.
567580 // TODO: Should we split them?
568- // TODO: for some reason, the following cond is not working, and always use Get5558Latency
569- defvar VFMulAddLatency = !if(!eq(sew, 64), Get6678Latency<mx>.c, Get5558Latency<mx>.c);
570- let Latency = VFMulAddLatency, ReleaseAtCycles = [GetOneM1AndDouble<mx>.c] in {
581+ // TODO: for some reason, the following cond is not working, and always uses ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c
582+ defvar VFMulAddLatency = !if(!eq(sew, 64),
583+ Get6678Latency<mx>.c,
584+ ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c
585+ );
586+ let Latency = VFMulAddLatency, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in {
571587 defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
572588 defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
573589 }
@@ -578,13 +594,18 @@ foreach mx = SchedMxList in {
578594 defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
579595
580596 // Slightly increased ReleaseAtCycles for M8: 18
581- defvar VFCmpOcc = !if(!eq(mx, "M8"), !add(GetOneMF2AndDouble<mx>.c, 2), GetOneMF2AndDouble<mx>.c);
597+ defvar VFCmpOcc = !if(!eq(mx, "M8"),
598+ !add(ConstOneUntilMF2ThenDouble<mx>.c, 2),
599+ ConstOneUntilMF2ThenDouble<mx>.c
600+ );
582601 let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [VFCmpOcc] in {
583602 defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>;
584603 defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>;
585604 }
586605
587- let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [GetOneMF2AndDouble<mx>.c] in {
606+ defvar VFClassLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
607+ defvar VFClassOcc = ConstOneUntilMF2ThenDouble<mx>.c;
608+ let Latency = VFClassLat, ReleaseAtCycles = [VFClassOcc] in {
588609 defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>;
589610 defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>;
590611 defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>;
@@ -597,7 +618,9 @@ foreach mx = SchedMxListW in {
597618 foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
598619 defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
599620
600- let Latency = !mul(GetOneM1AndDouble<mx>.c, 4), ReleaseAtCycles = [GetOneMF4AndDouble<mx>.c] in {
621+ defvar VFWCvtILat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
622+ defvar VFWCvtIOcc = ConstOneUntilMF4ThenDouble<mx>.c;
623+ let Latency = VFWCvtILat, ReleaseAtCycles = [VFWCvtIOcc] in {
601624 defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
602625 }
603626 }
@@ -606,7 +629,9 @@ foreach mx = SchedMxListW in {
606629foreach mx = SchedMxListFW in {
607630 defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListFW>.c;
608631
609- let Latency = !mul(GetOneM1AndDouble<mx>.c, 4), ReleaseAtCycles = [GetOneMF4AndDouble<mx>.c] in {
632+ defvar VFWCvtFToIVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
633+ defvar VFWCvtFToIVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
634+ let Latency = VFWCvtFToIVLat, ReleaseAtCycles = [VFWCvtFToIVOcc] in {
610635 defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
611636 }
612637}
@@ -615,36 +640,48 @@ foreach mx = SchedMxListFW in {
615640 foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
616641 defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
617642
618- let Latency = !mul(GetOneM1AndDouble<mx>.c, 4), ReleaseAtCycles = [GetOneMF4AndDouble<mx>.c] in {
643+ defvar VFWCvtFToFVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
644+ defvar VFWCvtFToFVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
645+ let Latency = VFWCvtFToFVLat, ReleaseAtCycles = [VFWCvtFToFVOcc] in {
619646 defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
620647 }
621648
622- let Latency = !mul(GetOneM1AndDouble<mx>.c, 4), ReleaseAtCycles = [GetOneMF4AndDouble<mx>.c] in {
623- // Latency for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 4/4/4/5/8
624- // Latency for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 5/5/5/9/17
625- // ReleaseAtCycles for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 1/1/2/4/8
626- // ReleaseAtCycles for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 1/2/4/8/17
627- // TODO: Split .wf/.wv variants into separate scheduling classes
649+ // Latency for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 4/4/4/5/8
650+ // ReleaseAtCycles for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 1/1/2/4/8
651+ // Latency for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 5/5/5/9/17
652+ // ReleaseAtCycles for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 1/2/4/8/17
653+ // We use the worst-case
654+ defvar VFWALULat = !add(ConstValueUntilLMULThenDouble<"M1", 4, mx>.c, 1); // 5/5/9/17
655+ defvar VFWALUOcc = !if(!eq(mx, "M4"),
656+ !add(ConstOneUntilMF4ThenDouble<mx>.c, 1), // 2/4/8/17
657+ ConstOneUntilMF4ThenDouble<mx>.c
658+ );
659+ // TODO: Split .wf/.wv variants into separate scheduling classes
660+ let Latency = VFWALULat, ReleaseAtCycles = [VFWALUOcc] in {
628661 defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>;
629662 defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>;
630663 }
631664
632- let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [GetOneMF2AndDouble <mx>.c] in {
665+ let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble <mx>.c] in {
633666 defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>;
634667 }
635668
636669 // Slightly increased ReleaseAtCycles for SEW == 32
637670 defvar VFWMullOcc = !if(!eq(sew, 32),
638- !add(GetDoublingValueFromLMUL <"M2", 2, mx>.c, 1), // 1/3/5/9
639- GetOneMF2AndDouble <mx>.c
671+ !add(ConstValueUntilLMULThenDouble <"M2", 2, mx>.c, 1), // 1/3/5/9
672+ ConstOneUntilMF2ThenDouble <mx>.c
640673 );
641- let Latency = Get5558Latency<mx>.c, ReleaseAtCycles = [VFWMullOcc] in {
674+ defvar VFWMulVLat = ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c;
675+ let Latency = VFWMulVLat, ReleaseAtCycles = [VFWMullOcc] in {
642676 defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>;
643677 }
644678
645- // Pattern for vfwmacc, vfwnmacc, etc: e16 = 5/5/5/8; e32 = 6/6/7/8
646- // Use existing 6,6,7,8 as close approximation
647- let Latency = Get6678Latency<mx>.c, ReleaseAtCycles = [GetOneMF2AndDouble<mx>.c] in {
679+ // Latency for vfwmacc, vfwnmacc, etc: e16 = 5/5/5/8; e32 = 6/6/7/8
680+ defvar VFWMulAddVLat = !if(!eq(sew, 16),
681+ ConstValueUntilLMULThenDoubleBase<"M4", 5, 8, mx>.c,
682+ Get6678Latency<mx>.c
683+ );
684+ let Latency = VFWMulAddVLat, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in {
648685 defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>;
649686 defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>;
650687 }
@@ -655,7 +692,9 @@ foreach mx = SchedMxListFW in {
655692foreach mx = SchedMxListW in {
656693 defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
657694
658- let Latency = !mul(GetOneM1AndDouble<mx>.c, 4), ReleaseAtCycles = [GetOneMF4AndDouble<mx>.c] in {
695+ defvar VFNCvtFToIVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
696+ defvar VFNCvtFToIVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
697+ let Latency = VFNCvtFToIVLat, ReleaseAtCycles = [VFNCvtFToIVOcc] in {
659698 defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>;
660699 }
661700}
@@ -664,7 +703,9 @@ foreach mx = SchedMxListFW in {
664703 foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
665704 defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
666705
667- let Latency = !mul(GetOneM1AndDouble<mx>.c, 4), ReleaseAtCycles = [GetOneMF4AndDouble<mx>.c] in {
706+ defvar VFNCvtToFVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
707+ defvar VFNCvtToFVOcc = ConstOneUntilMF4ThenDouble<mx>.c;
708+ let Latency = VFNCvtToFVLat, ReleaseAtCycles = [VFNCvtToFVOcc] in {
668709 defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
669710 defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>;
670711 }
@@ -682,13 +723,13 @@ foreach mx = SchedMxListF in {
682723 // We use the worst-case, vfdiv.vf is penalized in e64
683724 // TODO: split vfdiv.vf and vfrdiv.vf into separate scheduling classes
684725 defvar VFDivFFactor = !if(!eq(sew, 64), 40, 12);
685- defvar VFDivFLatAndOcc = !mul(GetOneM1AndDouble <mx>.c, VFDivFFactor);
726+ defvar VFDivFLatAndOcc = !mul(ConstOneUntilM1ThenDouble <mx>.c, VFDivFFactor);
686727 let Latency = VFDivFLatAndOcc, ReleaseAtCycles = [VFDivFLatAndOcc] in {
687728 defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>;
688729 }
689730
690731 defvar VFDivVFactor = !if(!eq(sew, 16), 12, 40);
691- defvar VFDivVLatAndOcc = !mul(GetOneM1AndDouble <mx>.c, VFDivVFactor);
732+ defvar VFDivVLatAndOcc = !mul(ConstOneUntilM1ThenDouble <mx>.c, VFDivVFactor);
692733 let Latency = VFDivVLatAndOcc, ReleaseAtCycles = [VFDivVLatAndOcc] in {
693734 defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>;
694735 }
@@ -701,7 +742,7 @@ foreach mx = SchedMxListF in {
701742 defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
702743
703744 defvar VFSqrtVFactor = !if(!eq(sew, 16), 12, 40);
704- defvar VFSqrtVLatAndOcc = !mul(GetOneM1AndDouble <mx>.c, VFSqrtVFactor);
745+ defvar VFSqrtVLatAndOcc = !mul(ConstOneUntilM1ThenDouble <mx>.c, VFSqrtVFactor);
705746 let Latency = VFSqrtVLatAndOcc, ReleaseAtCycles = [VFSqrtVLatAndOcc] in {
706747 defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>;
707748 }
0 commit comments