1313//
1414//===----------------------------------------------------------------------===//
1515
16- //===----------------------------------------------------------------------===//
17- // Helpers
18-
19- // Maps LMUL string to corresponding value from the Values array
20- // LMUL values map to array indices as follows:
21- // MF8 -> Values[0], MF4 -> Values[1], MF2 -> Values[2], M1 -> Values[3],
22- // M2 -> Values[4], M4 -> Values[5], M8 -> Values[6]
23- // Shorter lists are allowed, e.g., widening instructions don't work on M8
24- class GetLMULValue<list<int> Values, string LMUL> {
25- defvar Index = !cond(
26- !eq(LMUL, "MF8"): 0,
27- !eq(LMUL, "MF4"): 1,
28- !eq(LMUL, "MF2"): 2,
29- !eq(LMUL, "M1"): 3,
30- !eq(LMUL, "M2"): 4,
31- !eq(LMUL, "M4"): 5,
32- !eq(LMUL, "M8"): 6,
33- );
34-
35- assert !lt(Index, !size(Values)),
36- "Missing LMUL value for '" # LMUL # "'. " #
37- "Expected at least " # !add(Index, 1) # " elements, but got " #
38- !size(Values) # ".";
39-
40- int c = Values[Index];
16+ class SMX60IsWorstCaseMX<string mx, list<string> MxList> {
17+ string LLMUL = LargestLMUL<MxList>.r;
18+ bit c = !eq(mx, LLMUL);
4119}
4220
43- // Returns BaseValue for LMUL values before startLMUL, Value for startLMUL,
44- // then doubles Value for each subsequent LMUL
45- // Example: ConstValueUntilLMULThenDoubleBase<"M1", 2, 4, mx>.c evaluates, per mx, to:
46- // MF8->2, MF4->2, MF2->2, M1->4, M2->8, M4->16, M8->32
47- // This is useful for modeling scheduling parameters that scale with LMUL.
48- class ConstValueUntilLMULThenDoubleBase<string startLMUL, int BaseValue, int Value, string currentLMUL> {
49- assert !le(BaseValue, Value), "BaseValue must be le to Value";
50- defvar startPos = GetLMULValue<[0, 1, 2, 3, 4, 5, 6], startLMUL>.c;
51- defvar currentPos = GetLMULValue<[0, 1, 2, 3, 4, 5, 6], currentLMUL>.c;
21+ class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> {
22+ string LLMUL = LargestLMUL<MxList>.r;
23+ int SSEW = SmallestSEW<mx, isF>.r;
24+ bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
25+ }
5226
53- // Calculate the difference in positions
54- defvar posDiff = !sub(currentPos, startPos );
27+ defvar SMX60VLEN = 256;
28+ defvar SMX60DLEN = !div(SMX60VLEN, 2 );
5529
56- // Calculate Value * (2^posDiff) using shift left
30+ class Get1248Latency<string mx> {
5731 int c = !cond(
58- !lt(posDiff, 0) : BaseValue,
59- !eq(posDiff, 0) : Value,
60- true: !mul(Value, !shl(1, posDiff))
32+ !eq(mx, "M2") : 2,
33+ !eq(mx, "M4") : 4,
34+ !eq(mx, "M8") : 8,
35+ true: 1
6136 );
6237}
6338
64- // Same as the previous function but BaseValue == Value
65- class ConstValueUntilLMULThenDouble<string startLMUL, int Value, string currentLMUL> {
66- int c = ConstValueUntilLMULThenDoubleBase<startLMUL, Value, Value, currentLMUL>.c;
67- }
68-
69- // Returns MF8->1, MF4->1, MF2->2, M1->4, M2->8, M4->16, M8->32
70- class ConstOneUntilMF4ThenDouble<string mx> {
71- int c = ConstValueUntilLMULThenDouble<"MF4", 1, mx>.c;
72- }
73-
74- // Returns MF8->1, MF4->1, MF2->1, M1->2, M2->4, M4->8, M8->16
75- class ConstOneUntilMF2ThenDouble<string mx> {
76- int c = ConstValueUntilLMULThenDouble<"MF2", 1, mx>.c;
77- }
78-
79- // Returns MF8->1, MF4->1, MF2->1, M1->1, M2->2, M4->4, M8->8
80- class ConstOneUntilM1ThenDouble<string mx> {
81- int c = ConstValueUntilLMULThenDouble<"M1", 1, mx>.c;
39+ // Used for: logical ops, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides
40+ class Get4816Latency<string mx> {
41+ int c = !cond(
42+ !eq(mx, "M4") : 8,
43+ !eq(mx, "M8") : 16,
44+ true: 4
45+ );
8246}
8347
84- //===----------------------------------------------------------------------===//
85- // Latency helper classes
86-
8748// Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max
88- class Get4458Latency<string mx> {
89- int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/4, /*M4=*/5, /*M8=*/8], mx>.c;
49+ class Get458Latency<string mx> {
50+ int c = !cond(
51+ !eq(mx, "M4") : 5,
52+ !eq(mx, "M8") : 8,
53+ true: 4
54+ );
9055}
9156
92- // Used for: widening operations (no M8)
57+ // Widening scaling pattern (4,4,4,4,5,8,8): plateaus at higher LMULs
58+ // Used for: widening operations
9359class Get4588Latency<string mx> {
94- int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/5, /*M4=*/8], mx>.c;
60+ int c = !cond(
61+ !eq(mx, "M2") : 5,
62+ !eq(mx, "M4") : 8,
63+ !eq(mx, "M8") : 8, // M8 not supported for most widening, fallback
64+ true: 4
65+ );
9566}
9667
9768// Used for: mask-producing comparisons, carry ops with mask, FP comparisons
9869class Get461018Latency<string mx> {
99- int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/6, /*M4=*/10, /*M8=*/18], mx>.c;
100- }
101-
102- //===----------------------------------------------------------------------===//
103-
104- class SMX60IsWorstCaseMX<string mx, list<string> MxList> {
105- string LLMUL = LargestLMUL<MxList>.r;
106- bit c = !eq(mx, LLMUL);
70+ int c = !cond(
71+ !eq(mx, "M2") : 6,
72+ !eq(mx, "M4") : 10,
73+ !eq(mx, "M8") : 18,
74+ true: 4
75+ );
10776}
10877
109- class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> {
110- string LLMUL = LargestLMUL<MxList>.r;
111- int SSEW = SmallestSEW<mx, isF>.r;
112- bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
78+ // Used for: e64 multiply pattern, complex ops
79+ class Get781632Latency<string mx> {
80+ int c = !cond(
81+ !eq(mx, "M2") : 8,
82+ !eq(mx, "M4") : 16,
83+ !eq(mx, "M8") : 32,
84+ true: 7
85+ );
11386}
11487
115- defvar SMX60VLEN = 256;
116- defvar SMX60DLEN = !div(SMX60VLEN, 2);
117-
11888def SpacemitX60Model : SchedMachineModel {
11989 let IssueWidth = 2; // dual-issue
12090 let MicroOpBufferSize = 0; // in-order
@@ -413,13 +383,12 @@ foreach LMul = [1, 2, 4, 8] in {
413383foreach mx = SchedMxList in {
414384 defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
415385
416- let Latency = Get4458Latency <mx>.c, ReleaseAtCycles = [4] in {
386+ let Latency = Get458Latency <mx>.c, ReleaseAtCycles = [4] in {
417387 defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
418388 defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
419389 }
420390
421- defvar VIALULat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
422- let Latency = VIALULat, ReleaseAtCycles = [4] in {
391+ let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in {
423392 // Pattern of vadd, vsub, vrsub: 4/4/5/8
424393 // Pattern of vand, vor, vxor: 4/4/8/16
425394 // They are grouped together, so we used the worst case 4/4/8/16
@@ -456,7 +425,7 @@ foreach mx = SchedMxList in {
456425 // Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5,5,5,8,
457426 // e64 = 7,8,16,32. We use the worst-case until we can split the SEW.
458427 // TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites
459- let Latency = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c, ReleaseAtCycles = [7] in {
428+ let Latency = Get781632Latency< mx>.c, ReleaseAtCycles = [7] in {
460429 defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
461430 defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
462431 defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
@@ -492,8 +461,15 @@ foreach mx = SchedMxList in {
492461 foreach sew = SchedSEWSet<mx>.val in {
493462 defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
494463
495- defvar VIDivLat = ConstValueUntilLMULThenDouble<"MF2", 12, mx>.c;
496- let Latency = VIDivLat, ReleaseAtCycles = [12] in {
464+ // Slightly reduced for fractional LMULs
465+ defvar Multiplier = !cond(
466+ !eq(mx, "MF8") : 12,
467+ !eq(mx, "MF4") : 12,
468+ !eq(mx, "MF2") : 12,
469+ true: 24
470+ );
471+
472+ let Latency = !mul(Get1248Latency<mx>.c, Multiplier), ReleaseAtCycles = [12] in {
497473 defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
498474 defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
499475 }
@@ -504,8 +480,14 @@ foreach mx = SchedMxList in {
504480foreach mx = SchedMxListW in {
505481 defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
506482
507- defvar VNarrowingLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
508- let Latency = VNarrowingLat, ReleaseAtCycles = [4] in {
483+ // Slightly increased for integer LMULs
484+ defvar Multiplier = !cond(
485+ !eq(mx, "M2") : 2,
486+ !eq(mx, "M4") : 2,
487+ true: 1
488+ );
489+
490+ let Latency = !mul(Get4816Latency<mx>.c, Multiplier), ReleaseAtCycles = [4] in {
509491 defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
510492 defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
511493 defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
0 commit comments