13
13
//
14
14
//===----------------------------------------------------------------------===//
15
15
16
- //===----------------------------------------------------------------------===//
17
- // Helpers
18
-
19
- // Maps LMUL string to corresponding value from the Values array
20
- // LMUL values map to array indices as follows:
21
- // MF8 -> Values[0], MF4 -> Values[1], MF2 -> Values[2], M1 -> Values[3],
22
- // M2 -> Values[4], M4 -> Values[5], M8 -> Values[6]
23
- // Shorter lists are allowed, e.g., widening instructions don't work on M8
24
- class GetLMULValue<list<int> Values, string LMUL> {
25
- defvar Index = !cond(
26
- !eq(LMUL, "MF8"): 0,
27
- !eq(LMUL, "MF4"): 1,
28
- !eq(LMUL, "MF2"): 2,
29
- !eq(LMUL, "M1"): 3,
30
- !eq(LMUL, "M2"): 4,
31
- !eq(LMUL, "M4"): 5,
32
- !eq(LMUL, "M8"): 6,
33
- );
34
-
35
- assert !lt(Index, !size(Values)),
36
- "Missing LMUL value for '" # LMUL # "'. " #
37
- "Expected at least " # !add(Index, 1) # " elements, but got " #
38
- !size(Values) # ".";
39
-
40
- int c = Values[Index];
16
+ class SMX60IsWorstCaseMX<string mx, list<string> MxList> {
17
+ string LLMUL = LargestLMUL<MxList>.r;
18
+ bit c = !eq(mx, LLMUL);
41
19
}
42
20
43
- // Returns BaseValue for LMUL values before startLMUL, Value for startLMUL,
44
- // then doubles Value for each subsequent LMUL
45
- // Example: ConstValueUntilLMULThenDoubleBase<"M1", 2, 4, "M8"> returns:
46
- // MF8->2, MF4->2, MF2->2, M1->4, M2->8, M4->16, M8->32
47
- // This is useful for modeling scheduling parameters that scale with LMUL.
48
- class ConstValueUntilLMULThenDoubleBase<string startLMUL, int BaseValue, int Value, string currentLMUL> {
49
- assert !le(BaseValue, Value), "BaseValue must be le to Value";
50
- defvar startPos = GetLMULValue<[0, 1, 2, 3, 4, 5, 6], startLMUL>.c;
51
- defvar currentPos = GetLMULValue<[0, 1, 2, 3, 4, 5, 6], currentLMUL>.c;
21
+ class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> {
22
+ string LLMUL = LargestLMUL<MxList>.r;
23
+ int SSEW = SmallestSEW<mx, isF>.r;
24
+ bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
25
+ }
52
26
53
- // Calculate the difference in positions
54
- defvar posDiff = !sub(currentPos, startPos );
27
+ defvar SMX60VLEN = 256;
28
+ defvar SMX60DLEN = !div(SMX60VLEN, 2 );
55
29
56
- // Calculate Value * (2^posDiff) using shift left
30
+ class Get1248Latency<string mx> {
57
31
int c = !cond(
58
- !lt(posDiff, 0) : BaseValue,
59
- !eq(posDiff, 0) : Value,
60
- true: !mul(Value, !shl(1, posDiff))
32
+ !eq(mx, "M2") : 2,
33
+ !eq(mx, "M4") : 4,
34
+ !eq(mx, "M8") : 8,
35
+ true: 1
61
36
);
62
37
}
63
38
64
- // Same as the previous function but BaseValue == Value
65
- class ConstValueUntilLMULThenDouble<string startLMUL, int Value, string currentLMUL> {
66
- int c = ConstValueUntilLMULThenDoubleBase<startLMUL, Value, Value, currentLMUL>.c;
67
- }
68
-
69
- // Returns MF8->1, MF4->1, MF2->2, M1->4, M2->8, M4->16, M8->32
70
- class ConstOneUntilMF4ThenDouble<string mx> {
71
- int c = ConstValueUntilLMULThenDouble<"MF4", 1, mx>.c;
72
- }
73
-
74
- // Returns MF8->1, MF4->1, MF2->1, M1->2, M2->4, M4->8, M8->16
75
- class ConstOneUntilMF2ThenDouble<string mx> {
76
- int c = ConstValueUntilLMULThenDouble<"MF2", 1, mx>.c;
77
- }
78
-
79
- // Returns MF8->1, MF4->1, MF2->1, M1->1, M2->2, M4->4, M8->8
80
- class ConstOneUntilM1ThenDouble<string mx> {
81
- int c = ConstValueUntilLMULThenDouble<"M1", 1, mx>.c;
39
+ // Used for: logical ops, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides
40
+ class Get4816Latency<string mx> {
41
+ int c = !cond(
42
+ !eq(mx, "M4") : 8,
43
+ !eq(mx, "M8") : 16,
44
+ true: 4
45
+ );
82
46
}
83
47
84
- //===----------------------------------------------------------------------===//
85
- // Latency helper classes
86
-
87
48
// Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max
88
- class Get4458Latency<string mx> {
89
- int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/4, /*M4=*/5, /*M8=*/8], mx>.c;
49
+ class Get458Latency<string mx> {
50
+ int c = !cond(
51
+ !eq(mx, "M4") : 5,
52
+ !eq(mx, "M8") : 8,
53
+ true: 4
54
+ );
90
55
}
91
56
92
- // Used for: widening operations (no M8)
57
+ // Widening scaling pattern (4,4,4,4,5,8,8): plateaus at higher LMULs
58
+ // Used for: widening operations
93
59
class Get4588Latency<string mx> {
94
- int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/5, /*M4=*/8], mx>.c;
60
+ int c = !cond(
61
+ !eq(mx, "M2") : 5,
62
+ !eq(mx, "M4") : 8,
63
+ !eq(mx, "M8") : 8, // M8 not supported for most widening, fallback
64
+ true: 4
65
+ );
95
66
}
96
67
97
68
// Used for: mask-producing comparisons, carry ops with mask, FP comparisons
98
69
class Get461018Latency<string mx> {
99
- int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/6, /*M4=*/10, /*M8=*/18], mx>.c;
100
- }
101
-
102
- //===----------------------------------------------------------------------===//
103
-
104
- class SMX60IsWorstCaseMX<string mx, list<string> MxList> {
105
- string LLMUL = LargestLMUL<MxList>.r;
106
- bit c = !eq(mx, LLMUL);
70
+ int c = !cond(
71
+ !eq(mx, "M2") : 6,
72
+ !eq(mx, "M4") : 10,
73
+ !eq(mx, "M8") : 18,
74
+ true: 4
75
+ );
107
76
}
108
77
109
- class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0> {
110
- string LLMUL = LargestLMUL<MxList>.r;
111
- int SSEW = SmallestSEW<mx, isF>.r;
112
- bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
78
+ // Used for: e64 multiply pattern, complex ops
79
+ class Get781632Latency<string mx> {
80
+ int c = !cond(
81
+ !eq(mx, "M2") : 8,
82
+ !eq(mx, "M4") : 16,
83
+ !eq(mx, "M8") : 32,
84
+ true: 7
85
+ );
113
86
}
114
87
115
- defvar SMX60VLEN = 256;
116
- defvar SMX60DLEN = !div(SMX60VLEN, 2);
117
-
118
88
def SpacemitX60Model : SchedMachineModel {
119
89
let IssueWidth = 2; // dual-issue
120
90
let MicroOpBufferSize = 0; // in-order
@@ -413,13 +383,12 @@ foreach LMul = [1, 2, 4, 8] in {
413
383
foreach mx = SchedMxList in {
414
384
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
415
385
416
- let Latency = Get4458Latency <mx>.c, ReleaseAtCycles = [4] in {
386
+ let Latency = Get458Latency <mx>.c, ReleaseAtCycles = [4] in {
417
387
defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
418
388
defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
419
389
}
420
390
421
- defvar VIALULat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c;
422
- let Latency = VIALULat, ReleaseAtCycles = [4] in {
391
+ let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in {
423
392
// Pattern of vadd, vsub, vrsub: 4/4/5/8
424
393
// Pattern of vand, vor, vxor: 4/4/8/16
425
394
// They are grouped together, so we used the worst case 4/4/8/16
@@ -456,7 +425,7 @@ foreach mx = SchedMxList in {
456
425
// Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5,5,5,8,
457
426
// e64 = 7,8,16,32. We use the worst-case until we can split the SEW.
458
427
// TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites
459
- let Latency = ConstValueUntilLMULThenDoubleBase<"M2", 7, 8, mx>.c, ReleaseAtCycles = [7] in {
428
+ let Latency = Get781632Latency< mx>.c, ReleaseAtCycles = [7] in {
460
429
defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
461
430
defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
462
431
defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
@@ -492,8 +461,15 @@ foreach mx = SchedMxList in {
492
461
foreach sew = SchedSEWSet<mx>.val in {
493
462
defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
494
463
495
- defvar VIDivLat = ConstValueUntilLMULThenDouble<"MF2", 12, mx>.c;
496
- let Latency = VIDivLat, ReleaseAtCycles = [12] in {
464
+ // Slightly reduced for fractional LMULs
465
+ defvar Multiplier = !cond(
466
+ !eq(mx, "MF8") : 12,
467
+ !eq(mx, "MF4") : 12,
468
+ !eq(mx, "MF2") : 12,
469
+ true: 24
470
+ );
471
+
472
+ let Latency = !mul(Get1248Latency<mx>.c, Multiplier), ReleaseAtCycles = [12] in {
497
473
defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
498
474
defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
499
475
}
@@ -504,8 +480,14 @@ foreach mx = SchedMxList in {
504
480
foreach mx = SchedMxListW in {
505
481
defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
506
482
507
- defvar VNarrowingLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c;
508
- let Latency = VNarrowingLat, ReleaseAtCycles = [4] in {
483
+ // Slightly increased for integer LMULs
484
+ defvar Multiplier = !cond(
485
+ !eq(mx, "M2") : 2,
486
+ !eq(mx, "M4") : 2,
487
+ true: 1
488
+ );
489
+
490
+ let Latency = !mul(Get4816Latency<mx>.c, Multiplier), ReleaseAtCycles = [4] in {
509
491
defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
510
492
defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
511
493
defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
0 commit comments