@@ -750,39 +750,82 @@ foreach mx = SchedMxList in {
 }

 // 16. Vector Permutation Instructions
+// Slide
 foreach mx = SchedMxList in {
   defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;

-  defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>;
+  // Latency for slide up: 4/4/8/16, ReleaseAtCycles is 2/4/8/16
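+  // (ConstValueUntilLMULThenDouble<"M2", 4, mx> is assumed to yield 4 up to
+  // LMUL M2 and then to double with each LMUL step, giving 4/4/8/16 for
+  // M1/M2/M4/M8; ConstOneUntilMF2ThenDouble is assumed to be 1 for the
+  // fractional LMULs and then to double from M1 up, giving 2/4/8/16.)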
+  defvar VSlideUpLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
+  defvar VSlideUpOcc = ConstOneUntilMF2ThenDouble<mx>.c;
+  let Latency = VSlideUpLat, ReleaseAtCycles = [VSlideUpOcc] in {
+    defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>;
+  }

-  defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>;
-  defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>;
+  // Latency for slide down: 4/5/9/17, ReleaseAtCycles is 3/5/9/17
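+  // (GetLMULValue is assumed to take one entry per LMUL in the order
+  // MF8/MF4/MF2/M1/M2/M4/M8, so the 4/5/9/17 above are the M1..M8 entries.)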
+  defvar VSlideDownLat = GetLMULValue<[4, 4, 4, 4, 5, 9, 17], mx>.c;
+  defvar VSlideDownOcc = GetLMULValue<[1, 1, 1, 3, 5, 9, 17], mx>.c;
+  let Latency = VSlideDownLat, ReleaseAtCycles = [VSlideDownOcc] in {
+    defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>;
+  }
+  // The following writes group slide up and down together, so we use the
+  // worst case (slide down) for all of them.
+  let Latency = VSlideDownLat, ReleaseAtCycles = [VSlideDownOcc] in {
+    defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>;

-  defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>;
-  defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>;
+  }
 }

-def : WriteRes<WriteVMovXS, [SMX60_VIEU]>;
-def : WriteRes<WriteVMovSX, [SMX60_VIEU]>;
-
-def : WriteRes<WriteVMovFS, [SMX60_VIEU]>;
-def : WriteRes<WriteVMovSF, [SMX60_VIEU]>;
+// ReleaseAtCycles is 2/2/2/2/2/3/6, but we can't set it based on MX for now
+// TODO: Split this into separate WriteRes for each MX
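+// (The single value below appears to be the M8 worst case from that list.)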
+let Latency = 6, ReleaseAtCycles = [6] in {
+  def : WriteRes<WriteVMovXS, [SMX60_VIEU]>;
+}

-// Gather and Compress
-foreach mx = SchedMxList in {
-  foreach sew = SchedSEWSet<mx>.val in {
-    defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
-    defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCase>;
-    defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCase>;
-    defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCase>;
-  }
+// ReleaseAtCycles is 1/1/1/1/1/2/4, but we can't set it based on MX for now
+// TODO: Split this into separate WriteRes for each MX
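+// (As above, the single value below appears to be the M8 worst case.)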
+let Latency = 4, ReleaseAtCycles = [4] in {
+  def : WriteRes<WriteVMovSX, [SMX60_VIEU]>;
+  def : WriteRes<WriteVMovFS, [SMX60_VIEU]>;
+  def : WriteRes<WriteVMovSF, [SMX60_VIEU]>;
 }

+// Integer LMUL Gather and Compress
 foreach mx = SchedMxList in {
   defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;

-  defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>;
-  defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>;
+  defvar VRGatherLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
+  let Latency = VRGatherLat, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in {
+    defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>;
+  }
+
+  foreach sew = SchedSEWSet<mx>.val in {
+    defvar IsWorstCaseSEW = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+
+    defvar VRGatherVVLat = GetLMULValue<[4, 4, 4, 4, 16, 64, 256], mx>.c;
+    defvar VRGatherVVOcc = GetLMULValue<[1, 1, 1, 4, 16, 64, 256], mx>.c;
+    let Latency = VRGatherVVLat, ReleaseAtCycles = [VRGatherVVOcc] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCaseSEW>;
+    }
+    // For sew == 8, latency is double that of the other SEWs, except for the
+    // fractional LMULs (constant 4 cycles) and M8 (256 in both cases)
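+    // (Presumably because vrgatherei16.vv uses 16-bit indices, so with SEW=8
+    // data the index operand is assumed to have EMUL = 2 * LMUL, doubling the
+    // register-group work.)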
+    defvar VRGatherEI16Lat = !if(!eq(sew, 8),
+        GetLMULValue<[4, 4, 4, 8, 32, 128, 256], mx>.c,
+        GetLMULValue<[4, 4, 4, 4, 16, 64, 256], mx>.c);
+    defvar VRGatherEI16Occ = !if(!eq(sew, 8),
+        GetLMULValue<[1, 1, 2, 8, 32, 128, 256], mx>.c,
+        GetLMULValue<[1, 1, 1, 4, 16, 64, 256], mx>.c);
+    let Latency = VRGatherEI16Lat, ReleaseAtCycles = [VRGatherEI16Occ] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCaseSEW>;
+    }
+
+    defvar VCompressVLat = GetLMULValue<[4, 4, 4, 4, 10, 36, 136], mx>.c;
+    defvar VCompressVOcc = GetLMULValue<[1, 1, 1, 3, 10, 36, 136], mx>.c;
+    let Latency = VCompressVLat, ReleaseAtCycles = [VCompressVOcc] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCaseSEW>;
+    }
+  }
 }

 // Others