@@ -24,6 +24,67 @@ class SMX60IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit isF = 0
2424 bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
2525}
2626
27+ defvar SMX60VLEN = 256; // base constant from which SMX60DLEN is derived
28+ defvar SMX60DLEN = !div(SMX60VLEN, 2); // half of SMX60VLEN, i.e. 128
29+
30+ class Get1248Latency<string mx> { // c = 2/4/8 for mx "M2"/"M4"/"M8"; 1 for any other mx
31+ int c = !cond(
32+ !eq(mx, "M2") : 2,
33+ !eq(mx, "M4") : 4,
34+ !eq(mx, "M8") : 8,
35+ true: 1
36+ );
37+ }
38+
39+ // Used for: logical ops, shifts, sign ext, merge/move, FP sign/recip/convert, mask ops, slides
40+ class Get4816Latency<string mx> { // c = 8 for "M4", 16 for "M8"; 4 for any other mx
41+ int c = !cond(
42+ !eq(mx, "M4") : 8,
43+ !eq(mx, "M8") : 16,
44+ true: 4
45+ );
46+ }
47+
48+ // Used for: arithmetic (add/sub/min/max), saturating/averaging, FP add/sub/min/max
49+ class Get458Latency<string mx> { // c = 5 for "M4", 8 for "M8"; 4 for any other mx
50+ int c = !cond(
51+ !eq(mx, "M4") : 5,
52+ !eq(mx, "M8") : 8,
53+ true: 4
54+ );
55+ }
56+
57+ // Widening scaling pattern (4,4,4,4,5,8,8): plateaus at higher LMULs
58+ // Used for: widening operations
59+ class Get4588Latency<string mx> { // c = 5 for "M2", 8 for "M4" and "M8"; 4 for any other mx
60+ int c = !cond(
61+ !eq(mx, "M2") : 5,
62+ !eq(mx, "M4") : 8,
63+ !eq(mx, "M8") : 8, // M8 not supported for most widening, fallback
64+ true: 4
65+ );
66+ }
67+
68+ // Used for: mask-producing comparisons, carry ops with mask, FP comparisons
69+ class Get461018Latency<string mx> { // c = 6/10/18 for "M2"/"M4"/"M8"; 4 for any other mx
70+ int c = !cond(
71+ !eq(mx, "M2") : 6,
72+ !eq(mx, "M4") : 10,
73+ !eq(mx, "M8") : 18,
74+ true: 4
75+ );
76+ }
77+
78+ // Used for: e64 multiply pattern, complex ops
79+ class Get781632Latency<string mx> { // c = 8/16/32 for "M2"/"M4"/"M8"; 7 for any other mx
80+ int c = !cond(
81+ !eq(mx, "M2") : 8,
82+ !eq(mx, "M4") : 16,
83+ !eq(mx, "M8") : 32,
84+ true: 7
85+ );
86+ }
87+
2788def SpacemitX60Model : SchedMachineModel {
2889 let IssueWidth = 2; // dual-issue
2990 let MicroOpBufferSize = 0; // in-order
@@ -322,71 +383,118 @@ foreach LMul = [1, 2, 4, 8] in {
322383foreach mx = SchedMxList in {
323384 defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c;
324385
325- defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
326- defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
327- defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
328- defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
329- defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>;
330- defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>;
331- defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
332- defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
333- defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
334- defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
335- defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>;
336- defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>;
337- defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
338- defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
339- defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
340- defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
341- defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
342- defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>;
343- defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>;
344- defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>;
345- defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>;
346-
347- defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>;
348- defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>;
349- defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>;
350-
351- defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
352- defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
353- defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
354- defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
386+ let Latency = Get458Latency<mx>.c, ReleaseAtCycles = [4] in {
387+ defm "" : LMULWriteResMX<"WriteVIMinMaxV", [SMX60_VIEU], mx, IsWorstCase>;
388+ defm "" : LMULWriteResMX<"WriteVIMinMaxX", [SMX60_VIEU], mx, IsWorstCase>;
389+ }
390+
391+ let Latency = Get4816Latency<mx>.c, ReleaseAtCycles = [4] in {
392+ // Pattern of vadd, vsub, vrsub: 4/4/5/8
393+ // Pattern of vand, vor, vxor: 4/4/8/16
394+ // They are grouped together, so we use the worst case 4/4/8/16
395+ // TODO: use InstRW to override individual instructions' scheduling data
396+ defm "" : LMULWriteResMX<"WriteVIALUV", [SMX60_VIEU], mx, IsWorstCase>;
397+ defm "" : LMULWriteResMX<"WriteVIALUX", [SMX60_VIEU], mx, IsWorstCase>;
398+ defm "" : LMULWriteResMX<"WriteVIALUI", [SMX60_VIEU], mx, IsWorstCase>;
399+
400+ defm "" : LMULWriteResMX<"WriteVExtV", [SMX60_VIEU], mx, IsWorstCase>;
401+ defm "" : LMULWriteResMX<"WriteVIMergeV", [SMX60_VIEU], mx, IsWorstCase>;
402+ defm "" : LMULWriteResMX<"WriteVIMergeX", [SMX60_VIEU], mx, IsWorstCase>;
403+ defm "" : LMULWriteResMX<"WriteVIMergeI", [SMX60_VIEU], mx, IsWorstCase>;
404+ defm "" : LMULWriteResMX<"WriteVIMovV", [SMX60_VIEU], mx, IsWorstCase>;
405+ defm "" : LMULWriteResMX<"WriteVIMovX", [SMX60_VIEU], mx, IsWorstCase>;
406+ defm "" : LMULWriteResMX<"WriteVIMovI", [SMX60_VIEU], mx, IsWorstCase>;
407+ defm "" : LMULWriteResMX<"WriteVShiftV", [SMX60_VIEU], mx, IsWorstCase>;
408+ defm "" : LMULWriteResMX<"WriteVShiftX", [SMX60_VIEU], mx, IsWorstCase>;
409+ defm "" : LMULWriteResMX<"WriteVShiftI", [SMX60_VIEU], mx, IsWorstCase>;
410+
411+ defm "" : LMULWriteResMX<"WriteVICALUV", [SMX60_VIEU], mx, IsWorstCase>;
412+ defm "" : LMULWriteResMX<"WriteVICALUX", [SMX60_VIEU], mx, IsWorstCase>;
413+ defm "" : LMULWriteResMX<"WriteVICALUI", [SMX60_VIEU], mx, IsWorstCase>;
414+ }
415+
416+ let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [4] in {
417+ defm "" : LMULWriteResMX<"WriteVICALUMV", [SMX60_VIEU], mx, IsWorstCase>;
418+ defm "" : LMULWriteResMX<"WriteVICALUMX", [SMX60_VIEU], mx, IsWorstCase>;
419+ defm "" : LMULWriteResMX<"WriteVICALUMI", [SMX60_VIEU], mx, IsWorstCase>;
420+ defm "" : LMULWriteResMX<"WriteVICmpV", [SMX60_VIEU], mx, IsWorstCase>;
421+ defm "" : LMULWriteResMX<"WriteVICmpX", [SMX60_VIEU], mx, IsWorstCase>;
422+ defm "" : LMULWriteResMX<"WriteVICmpI", [SMX60_VIEU], mx, IsWorstCase>;
423+ }
424+
425+ // Pattern of vmacc, vmadd, vmul, vmulh, etc.: e8/e16 = 4/4/5/8, e32 = 5/5/5/8,
426+ // e64 = 7/8/16/32. We use the worst case until we can split by SEW.
427+ // TODO: change WriteVIMulV, etc to be defined with LMULSEWSchedWrites
428+ let Latency = Get781632Latency<mx>.c, ReleaseAtCycles = [7] in {
429+ defm "" : LMULWriteResMX<"WriteVIMulV", [SMX60_VIEU], mx, IsWorstCase>;
430+ defm "" : LMULWriteResMX<"WriteVIMulX", [SMX60_VIEU], mx, IsWorstCase>;
431+ defm "" : LMULWriteResMX<"WriteVIMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
432+ defm "" : LMULWriteResMX<"WriteVIMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
433+ }
355434}
356435
357436// Widening
437+ // Pattern of vwmul, vwmacc, etc.: e8/e16 = 4/4/5/8, e32 = 5/5/5/8
438+ // We use the worst-case for all.
358439foreach mx = SchedMxListW in {
359440 defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
360441
361- defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
362- defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
363- defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
364- defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>;
365- defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>;
366- defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
367- defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
442+ let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [4] in {
443+ defm "" : LMULWriteResMX<"WriteVIWALUV", [SMX60_VIEU], mx, IsWorstCase>;
444+ defm "" : LMULWriteResMX<"WriteVIWALUX", [SMX60_VIEU], mx, IsWorstCase>;
445+ defm "" : LMULWriteResMX<"WriteVIWALUI", [SMX60_VIEU], mx, IsWorstCase>;
446+ defm "" : LMULWriteResMX<"WriteVIWMulV", [SMX60_VIEU], mx, IsWorstCase>;
447+ defm "" : LMULWriteResMX<"WriteVIWMulX", [SMX60_VIEU], mx, IsWorstCase>;
448+ defm "" : LMULWriteResMX<"WriteVIWMulAddV", [SMX60_VIEU], mx, IsWorstCase>;
449+ defm "" : LMULWriteResMX<"WriteVIWMulAddX", [SMX60_VIEU], mx, IsWorstCase>;
450+ }
368451}
369452
370- // Vector Integer Division and Remainder
453+ // Division and remainder operations
454+ // Pattern of vdivu: 11/11/11/20/40/80/160
455+ // Pattern of vdiv: 12/12/12/22/44/88/176
456+ // Pattern of vremu: 12/12/12/22/44/88/176
457+ // Pattern of vrem: 13/13/13/24/48/96/192
458+ // We use for all: 12/12/12/24/48/96/192
459+ // TODO: Create separate WriteVIRem to more closely match the latencies
371460foreach mx = SchedMxList in {
372461 foreach sew = SchedSEWSet<mx>.val in {
373462 defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
374463
375- defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
376- defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
464+ // Base latency per LMUL step: 12 for fractional LMULs (MF8/MF4/MF2), 24 for all others
465+ defvar Multiplier = !cond(
466+ !eq(mx, "MF8") : 12,
467+ !eq(mx, "MF4") : 12,
468+ !eq(mx, "MF2") : 12,
469+ true: 24
470+ );
471+
472+ let Latency = !mul(Get1248Latency<mx>.c, Multiplier), ReleaseAtCycles = [12] in {
473+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SMX60_VIEU], mx, sew, IsWorstCase>;
474+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [SMX60_VIEU], mx, sew, IsWorstCase>;
475+ }
377476 }
378477}
379478
380479// Narrowing Shift and Clips
381480foreach mx = SchedMxListW in {
382481 defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c;
383482
384- defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
385- defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
386- defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
387- defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
388- defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
389- defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
483+ // Latency is doubled for M2 and M4; every other LMUL keeps a multiplier of 1
484+ defvar Multiplier = !cond(
485+ !eq(mx, "M2") : 2,
486+ !eq(mx, "M4") : 2,
487+ true: 1
488+ );
489+
490+ let Latency = !mul(Get4816Latency<mx>.c, Multiplier), ReleaseAtCycles = [4] in {
491+ defm "" : LMULWriteResMX<"WriteVNShiftV", [SMX60_VIEU], mx, IsWorstCase>;
492+ defm "" : LMULWriteResMX<"WriteVNShiftX", [SMX60_VIEU], mx, IsWorstCase>;
493+ defm "" : LMULWriteResMX<"WriteVNShiftI", [SMX60_VIEU], mx, IsWorstCase>;
494+ defm "" : LMULWriteResMX<"WriteVNClipV", [SMX60_VIEU], mx, IsWorstCase>;
495+ defm "" : LMULWriteResMX<"WriteVNClipX", [SMX60_VIEU], mx, IsWorstCase>;
496+ defm "" : LMULWriteResMX<"WriteVNClipI", [SMX60_VIEU], mx, IsWorstCase>;
497+ }
390498}
391499
392500// 12. Vector Fixed-Point Arithmetic Instructions
0 commit comments