
Commit 3f7ac6e

[RISCV] Update SpacemiT-X60 vector permutation instructions latencies (#152738)
This PR adds hardware-measured latencies to the SpacemiT-X60 scheduling model for all instructions defined in Section 16 of the RVV specification, "Vector Permutation Instructions".

Signed-off-by: Mikhail R. Gadelha <[email protected]>
1 parent 5b56816 commit 3f7ac6e
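A note on the helpers used in the diff below: per-LMUL latencies and occupancies are expressed through GetLMULValue, ConstValueUntilLMULThenDouble, and ConstOneUntilMF2ThenDouble. The following Python sketch only illustrates how those tables appear to be read, assuming the seven-entry lists are indexed by LMUL in the order MF8, MF4, MF2, M1, M2, M4, M8 (as the in-diff comments suggest); it is not the TableGen implementation.

# Assumption: value tables are indexed as [MF8, MF4, MF2, M1, M2, M4, M8].
LMULS = ["MF8", "MF4", "MF2", "M1", "M2", "M4", "M8"]

def get_lmul_value(values, mx):
    # Pick the entry for one LMUL, like GetLMULValue<[...], mx> is used in the diff.
    return values[LMULS.index(mx)]

def const_until_lmul_then_double(threshold, value, mx):
    # Constant up to 'threshold', then doubled per LMUL step, mirroring the
    # assumed behavior of ConstValueUntilLMULThenDouble<threshold, value, mx>.
    return value * (2 ** max(0, LMULS.index(mx) - LMULS.index(threshold)))

# The slide-up comment in the diff says latency is 4/4/8/16 for M1/M2/M4/M8:
print([const_until_lmul_then_double("M2", 4, mx) for mx in ["M1", "M2", "M4", "M8"]])  # [4, 4, 8, 16]

# The slide-down latency table in the diff is 4/5/9/17 for M1/M2/M4/M8:
print([get_lmul_value([4, 4, 4, 4, 5, 9, 17], mx) for mx in ["M1", "M2", "M4", "M8"]])  # [4, 5, 9, 17]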

File tree

3 files changed, +684 -641 lines changed


llvm/lib/Target/RISCV/RISCVSchedSpacemitX60.td

Lines changed: 63 additions & 20 deletions
@@ -750,39 +750,82 @@ foreach mx = SchedMxList in {
 }
 
 // 16. Vector Permutation Instructions
+// Slide
 foreach mx = SchedMxList in {
   defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
 
-  defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>;
+  // Latency for slide up: 4/4/8/16, ReleaseAtCycles is 2/4/8/16
+  defvar VSlideUpLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
+  defvar VSlideUpOcc = ConstOneUntilMF2ThenDouble<mx>.c;
+  let Latency = VSlideUpLat, ReleaseAtCycles = [VSlideUpOcc] in {
+    defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>;
+  }
 
-  defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>;
-  defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>;
+  // Latency for slide down: 4/5/9/17, ReleaseAtCycles is 3/5/9/17
+  defvar VSlideDownLat = GetLMULValue<[4, 4, 4, 4, 5, 9, 17], mx>.c;
+  defvar VSlideDownOcc = GetLMULValue<[1, 1, 1, 3, 5, 9, 17], mx>.c;
+  let Latency = VSlideDownLat, ReleaseAtCycles = [VSlideDownOcc] in {
+    defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>;
+  }
+  // The following group slide up and down together, so we use the worst-case
+  // (slide down) for all.
+  let Latency = VSlideDownLat, ReleaseAtCycles = [VSlideDownOcc] in {
+    defm "" : LMULWriteResMX<"WriteVSlideI", [SMX60_VIEU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVISlide1X", [SMX60_VIEU], mx, IsWorstCase>;
 
-  defm "" : LMULWriteResMX<"WriteVSlideUpX", [SMX60_VIEU], mx, IsWorstCase>;
-  defm "" : LMULWriteResMX<"WriteVSlideDownX", [SMX60_VIEU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVFSlide1F", [SMX60_VFP], mx, IsWorstCase>;
+  }
 }
 
-def : WriteRes<WriteVMovXS, [SMX60_VIEU]>;
-def : WriteRes<WriteVMovSX, [SMX60_VIEU]>;
-
-def : WriteRes<WriteVMovFS, [SMX60_VIEU]>;
-def : WriteRes<WriteVMovSF, [SMX60_VIEU]>;
+// ReleaseAtCycles is 2/2/2/2/2/3/6, but we can't set based on MX for now
+// TODO: Split this into separate WriteRes for each MX
+let Latency = 6, ReleaseAtCycles = [6] in {
+  def : WriteRes<WriteVMovXS, [SMX60_VIEU]>;
+}
 
-// Gather and Compress
-foreach mx = SchedMxList in {
-  foreach sew = SchedSEWSet<mx>.val in {
-    defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
-    defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCase>;
-    defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCase>;
-    defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCase>;
-  }
+// ReleaseAtCycles is 1/1/1/1/1/2/4, but we can't set based on MX for now
+// TODO: Split this into separate WriteRes for each MX
+let Latency = 4, ReleaseAtCycles = [4] in {
+  def : WriteRes<WriteVMovSX, [SMX60_VIEU]>;
+  def : WriteRes<WriteVMovFS, [SMX60_VIEU]>;
+  def : WriteRes<WriteVMovSF, [SMX60_VIEU]>;
 }
 
+// Integer LMUL Gather and Compress
 foreach mx = SchedMxList in {
   defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
 
-  defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>;
-  defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>;
+  defvar VRGatherLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
+  let Latency = VRGatherLat, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in {
+    defm "" : LMULWriteResMX<"WriteVRGatherVX", [SMX60_VIEU], mx, IsWorstCase>;
+    defm "" : LMULWriteResMX<"WriteVRGatherVI", [SMX60_VIEU], mx, IsWorstCase>;
+  }
+
+  foreach sew = SchedSEWSet<mx>.val in {
+    defvar IsWorstCaseSEW = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+
+    defvar VRGatherVVLat = GetLMULValue<[4, 4, 4, 4, 16, 64, 256], mx>.c;
+    defvar VRGatherVVOcc = GetLMULValue<[1, 1, 1, 4, 16, 64, 256], mx>.c;
+    let Latency = VRGatherVVLat, ReleaseAtCycles = [VRGatherVVOcc] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SMX60_VIEU], mx, sew, IsWorstCaseSEW>;
+    }
+    // For sew == 8, latency is half of the other cases, except for the fractional LMULs (const 4 cycles)
+    defvar VRGatherEI16Lat = !if(!eq(sew, 8),
+                                 GetLMULValue<[4, 4, 4, 8, 32, 128, 256], mx>.c,
+                                 GetLMULValue<[4, 4, 4, 4, 16, 64, 256], mx>.c);
+    defvar VRGatherEI16Occ = !if(!eq(sew, 8),
+                                 GetLMULValue<[1, 1, 2, 8, 32, 128, 256], mx>.c,
+                                 GetLMULValue<[1, 1, 1, 4, 16, 64, 256], mx>.c);
+    let Latency = VRGatherEI16Lat, ReleaseAtCycles = [VRGatherEI16Occ] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherEI16VV", [SMX60_VIEU], mx, sew, IsWorstCaseSEW>;
+    }
+
+    defvar VCompressVLat = GetLMULValue<[4, 4, 4, 4, 10, 36, 136], mx>.c;
+    defvar VCompressVOcc = GetLMULValue<[1, 1, 1, 3, 10, 36, 136], mx>.c;
+    let Latency = VCompressVLat, ReleaseAtCycles = [VCompressVOcc] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVCompressV", [SMX60_VIEU], mx, sew, IsWorstCaseSEW>;
+    }
+  }
 }
 
 // Others
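To make the new numbers concrete: in MCA-style scheduling models, Latency bounds dependent chains while ReleaseAtCycles bounds throughput on the issuing resource (here SMX60_VIEU). The sketch below is a rough, back-of-the-envelope estimate under that interpretation, using the vrgather.vv tables from the diff; it is not what llvm-mca computes in detail.

# Rough estimate (assumption): with a single SMX60_VIEU resource, independent
# back-to-back instructions are bound by ReleaseAtCycles, dependent chains by Latency.
LMULS = ["MF8", "MF4", "MF2", "M1", "M2", "M4", "M8"]
VRGATHER_VV_LAT = [4, 4, 4, 4, 16, 64, 256]   # Latency table from the diff
VRGATHER_VV_OCC = [1, 1, 1, 4, 16, 64, 256]   # ReleaseAtCycles table from the diff

def estimate_cycles(n_instrs, mx, dependent=False):
    # Cycles for n_instrs vrgather.vv operations at a given LMUL.
    i = LMULS.index(mx)
    per_instr = VRGATHER_VV_LAT[i] if dependent else VRGATHER_VV_OCC[i]
    return n_instrs * per_instr

print(estimate_cycles(8, "M1"))                  # 8 independent vrgather.vv at LMUL=1 -> 32
print(estimate_cycles(8, "M8"))                  # the same loop at LMUL=8 -> 2048
print(estimate_cycles(8, "M2", dependent=True))  # a dependent chain at LMUL=2 -> 128

Such estimates can be cross-checked against llvm-mca output (for example with -mtriple=riscv64 and -mcpu=spacemit-x60 on a small vector permutation kernel), assuming that processor name corresponds to the scheduling model edited here.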
