@@ -24,6 +24,87 @@ class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0
2424 bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
2525}
2626
27+ defvar SMX60VLEN = 256;
28+ defvar SMX60DLEN = !div(SMX60VLEN, 2);
29+
30+ class Get1248Latency<string mx> {
31+ int c = !cond(
32+ !eq(mx, "M1") : 1,
33+ !eq(mx, "M2") : 2,
34+ !eq(mx, "M4") : 4,
35+ !eq(mx, "M8") : 8,
36+ !eq(mx, "MF2") : 1,
37+ !eq(mx, "MF4") : 1,
38+ !eq(mx, "MF8") : 1
39+ );
40+ }
41+
42+ // Used for: logical ops, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides
43+ class Get44816Latency<string mx> {
44+ int c = !cond(
45+ !eq(mx, "M1") : 4,
46+ !eq(mx, "M2") : 4,
47+ !eq(mx, "M4") : 8,
48+ !eq(mx, "M8") : 16,
49+ !eq(mx, "MF2") : 4,
50+ !eq(mx, "MF4") : 4,
51+ !eq(mx, "MF8") : 4
52+ );
53+ }
54+
55+ // Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max
56+ class Get4458Latency<string mx> {
57+ int c = !cond(
58+ !eq(mx, "M1") : 4,
59+ !eq(mx, "M2") : 4,
60+ !eq(mx, "M4") : 5,
61+ !eq(mx, "M8") : 8,
62+ !eq(mx, "MF2") : 4,
63+ !eq(mx, "MF4") : 4,
64+ !eq(mx, "MF8") : 4
65+ );
66+ }
67+
68+ // Widening scaling pattern (MF8..M8: 4,4,4,4,5,8,8): plateaus at higher LMULs
69+ // Used for: widening operations
70+ class Get4588Latency<string mx> {
71+ int c = !cond(
72+ !eq(mx, "M1") : 4,
73+ !eq(mx, "M2") : 5,
74+ !eq(mx, "M4") : 8,
75+ !eq(mx, "M8") : 8, // M8 not supported for most widening, fallback
76+ !eq(mx, "MF2") : 4,
77+ !eq(mx, "MF4") : 4,
78+ !eq(mx, "MF8") : 4
79+ );
80+ }
81+
82+ // Used for: mask-producing comparisons, carry ops with mask, FP comparisons
83+ class Get461018Latency<string mx> {
84+ int c = !cond(
85+ !eq(mx, "M1") : 4,
86+ !eq(mx, "M2") : 6,
87+ !eq(mx, "M4") : 10,
88+ !eq(mx, "M8") : 18,
89+ !eq(mx, "MF2") : 4,
90+ !eq(mx, "MF4") : 4,
91+ !eq(mx, "MF8") : 4
92+ );
93+ }
94+
95+ // Used for: e64 multiply pattern, complex ops
96+ class Get781632Latency<string mx> {
97+ int c = !cond(
98+ !eq(mx, "M1") : 7,
99+ !eq(mx, "M2") : 8,
100+ !eq(mx, "M4") : 16,
101+ !eq(mx, "M8") : 32,
102+ !eq(mx, "MF2") : 7,
103+ !eq(mx, "MF4") : 7,
104+ !eq(mx, "MF8") : 7
105+ );
106+ }
107+
27108def SpacemitX60Model : SchedMachineModel {
28109 let IssueWidth = 2; // dual-issue
29110 let MicroOpBufferSize = 0; // in-order
@@ -322,71 +403,120 @@ foreach LMul = [1, 2, 4, 8] in {
322403foreach mx = SchedMxList in {
323404 defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
324405
325- defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
326- defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
327- defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
328- defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
329- defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>;
330- defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>;
331- defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
332- defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
333- defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
334- defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
335- defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>;
336- defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>;
337- defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
338- defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
339- defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
340- defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
341- defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
342- defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>;
343- defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>;
344- defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>;
345- defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>;
346-
347- defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>;
348- defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>;
349- defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>;
350-
351- defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
352- defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
353- defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
354- defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
406+ let Latency = Get4458Latency<mx>.c, ReleaseAtCycles = [Get1248Latency<mx>.c] in {
407+ defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
408+ defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
409+ }
410+
411+ let Latency = Get44816Latency<mx>.c, ReleaseAtCycles = [Get44816Latency<mx>.c] in {
412+ // Pattern of vadd, vsub, vrsub: 4/4/5/8
413+ // Pattern of vand, vor, vxor: 4/4/8/16
414+ // They are grouped together, so we use the worst case 4/4/8/16
415+ // TODO: use InstRW to override individual instructions' scheduling data
416+ defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
417+ defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
418+ defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
419+
420+ defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
421+ defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
422+ defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
423+ defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>;
424+ defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>;
425+ defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>;
426+ defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>;
427+ defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>;
428+ defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>;
429+ defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>;
430+
431+ defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>;
432+ defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>;
433+ defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
434+ }
435+
436+ let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [Get461018Latency<mx>.c] in {
437+ defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
438+ defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
439+ defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
440+ defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>;
441+ defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>;
442+ defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
443+ }
444+
445+ // Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5/5/5/8,
446+ // e64 = 7/8/16/32. We use the worst case until we can split by SEW.
447+ // TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites
448+ let Latency = Get781632Latency<mx>.c in {
449+ defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
450+ defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
451+ defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
452+ defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
453+ }
355454}
356455
357456// Widening
457+ // Pattern of vwmul, vwmacc, etc.: e8/e16 = 4/4/5/8, e32 = 5/5/5/8
458+ // We use the worst-case for all.
358459foreach mx = SchedMxListW in {
359460 defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
360461
361- defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
362- defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
363- defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
364- defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>;
365- defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>;
366- defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
367- defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
462+ defvar WideningLat = Get4588Latency<mx>.c;
463+ let Latency = WideningLat, ReleaseAtCycles = [Get1248Latency<mx>.c] in {
464+ defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
465+ defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
466+ defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
467+ }
468+ let Latency = WideningLat in {
469+ defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>;
470+ defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>;
471+ defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
472+ defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
473+ }
368474}
369475
370- // Vector Integer Division and Remainder
476+ // Division and remainder operations
477+ // Pattern of vdivu: 11/11/11/20/40/80/160
478+ // Pattern of vdiv: 12/12/12/22/44/88/176
479+ // Pattern of vremu: 12/12/12/22/44/88/176
480+ // Pattern of vrem: 13/13/13/24/48/96/192
481+ // We use the worst-case for all: 24/24/24/24/48/96/192
482+ // TODO: Create separate WriteVIRem to more closely match the latencies
371483foreach mx = SchedMxList in {
372484 foreach sew = SchedSEWSet<mx>.val in {
373485 defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
374486
375- defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
376- defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
487+ let Latency = !mul(Get1248Latency<mx>.c, 24) in {
488+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
489+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
490+ }
377491 }
378492}
379493
380- // Narrowing Shift and Clips
381- foreach mx = SchedMxListW in {
494+ // Narrowing Shift and Clips for fractional LMULs and M1
495+ foreach mx = ["MF8", "MF4", "MF2", "M1"] in {
382496 defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
383497
384- defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
385- defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
386- defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
387- defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
388- defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
389- defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
498+ let Latency = Get44816Latency<mx>.c in {
499+ defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
500+ defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
501+ defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
502+ defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
503+ defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
504+ defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
505+ }
506+ }
507+
508+ // Narrowing Shift and Clips for LMUL > 1 (M2, M4): twice the Get44816Latency value
509+ foreach mx = ["M2", "M4"] in {
510+ defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
511+
512+ let Latency = !mul(Get44816Latency<mx>.c, 2) in {
513+ defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
514+ defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
515+ defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
516+ defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
517+ defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
518+ defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
519+ }
390520}
391521
392522// 12. Vector Fixed-Point Arithmetic Instructions
0 commit comments