@@ -22,6 +22,8 @@ class SiFiveP400IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit is
2222 bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
2323}
2424
25+ defvar SiFiveP400VLEN = 128;
26+
2527// 1 Micro-Op per cycle.
2628class SiFiveP400GetLMulCycles<string mx> {
2729 int c = !cond(
@@ -35,19 +37,31 @@ class SiFiveP400GetLMulCycles<string mx> {
3537 );
3638}
3739
38- // Latency for segmented loads and stores are calculated as vl * nf.
39- class SiFiveP400GetCyclesSegmented<string mx, int sew, int nf> {
40- defvar VLEN = 128;
41- defvar VLUpperBound = !cond(
42- !eq(mx, "M1") : !div(VLEN, sew),
43- !eq(mx, "M2") : !div(!mul(VLEN, 2), sew),
44- !eq(mx, "M4") : !div(!mul(VLEN, 4), sew),
45- !eq(mx, "M8") : !div(!mul(VLEN, 8), sew),
46- !eq(mx, "MF2") : !div(!div(VLEN, 2), sew),
47- !eq(mx, "MF4") : !div(!div(VLEN, 4), sew),
48- !eq(mx, "MF8") : !div(!div(VLEN, 8), sew),
// Computes the maximum vector length, in elements, for LMUL name `mx` and
// element width `sew`, based on SiFiveP400VLEN.
// Fractional LMULs (MF2/MF4/MF8) divide VLEN by 2/4/8 before dividing by sew;
// any other `mx` uses VLEN * SiFiveP400GetLMulCycles<mx>.c / sew.
// NOTE(review): this reuses the LMUL cycle count as the LMUL multiplier —
// presumably 1/2/4/8 for M1/M2/M4/M8; confirm against SiFiveP400GetLMulCycles.
// NOTE(review): !div is an integer division, so with VLEN = 128 the result is
// 0 for e.g. MF8 with sew = 32 or 64 — confirm users of `val` tolerate that.
40+ class SiFiveP400GetVLMAX<string mx, int sew> {
41+ defvar LMUL = SiFiveP400GetLMulCycles<mx>.c;
42+ int val = !cond(
43+ !eq(mx, "MF2") : !div(!div(SiFiveP400VLEN, 2), sew),
44+ !eq(mx, "MF4") : !div(!div(SiFiveP400VLEN, 4), sew),
45+ !eq(mx, "MF8") : !div(!div(SiFiveP400VLEN, 8), sew),
46+ true: !div(!mul(SiFiveP400VLEN, LMUL), sew)
47+ );
48+ }
49+
// Maps the VLMAX of (mx, sew) to a latency value in `val`.
// VL of 2, 4, 8 and 16 use fixed latencies (13, 18, 22, 30); every other VL
// falls through to VL - 2.
// NOTE(review): the fall-through is labelled for VL = 32/64/128 but also
// catches VL < 2. SiFiveP400GetVLMAX can evaluate to 0 or 1 for fractional
// LMUL with wide sew (e.g. MF2/sew=64 -> 1, MF8/sew=64 -> 0), which makes
// `val` negative. Confirm those combinations are never used, or add an
// explicit case for them.
50+ class SiFiveP400StridedLdStLatency<string mx, int sew> {
51+ defvar VL = SiFiveP400GetVLMAX<mx, sew>.val;
52+ int val = !cond(
53+ !eq(VL, 2): 13,
54+ !eq(VL, 4): 18,
55+ !eq(VL, 8): 22,
56+ !eq(VL, 16): 30,
57+ // VL=32,64,128
58+ true: !sub(VL, 2)
4959 );
50- int c = !mul(VLUpperBound, nf);
60+ }
61+
62+ // Latency for segmented loads and stores is calculated as vl * nf,
// where vl is the value of SiFiveP400GetVLMAX<mx, sew> and nf is the number
// of fields in the segment.
63+ class SiFiveP400SegmentedLdStCycles<string mx, int sew, int nf> {
64+ int c = !mul(SiFiveP400GetVLMAX<mx, sew>.val, nf);
5165}
5266
5367// Both variants of floating point vector reductions are based on numbers collected
@@ -368,65 +382,45 @@ def : WriteRes<WriteVSETIVLI, [SiFiveP400SYS]>;
368382def : WriteRes<WriteVSETVL, [SiFiveP400SYS]>;
369383
370384// 7. Vector Loads and Stores
371- // FIXME: This unit is still being improved, currently
372- // it is based on stage numbers. Estimates are optimistic,
373- // latency may be longer.
374- foreach mx = SchedMxList in {
375- defvar LMulLat = SiFiveP400GetLMulCycles<mx>.c;
376- defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
377- let Latency = 8, ReleaseAtCycles = [LMulLat] in {
378- defm "" : LMULWriteResMX<"WriteVLDE", [SiFiveP400VLD], mx, IsWorstCase>;
379- defm "" : LMULWriteResMX<"WriteVLDM", [SiFiveP400VLD], mx, IsWorstCase>;
380- defm "" : LMULWriteResMX<"WriteVLDFF", [SiFiveP400VLD], mx, IsWorstCase>;
381- }
382- let Latency = 12, ReleaseAtCycles = [LMulLat] in {
383- defm "" : LMULWriteResMX<"WriteVLDS8", [SiFiveP400VLD], mx, IsWorstCase>;
384- defm "" : LMULWriteResMX<"WriteVLDS16", [SiFiveP400VLD], mx, IsWorstCase>;
385- defm "" : LMULWriteResMX<"WriteVLDS32", [SiFiveP400VLD], mx, IsWorstCase>;
386- defm "" : LMULWriteResMX<"WriteVLDS64", [SiFiveP400VLD], mx, IsWorstCase>;
387- }
388- let Latency = 12, ReleaseAtCycles = [LMulLat] in {
389- defm "" : LMULWriteResMX<"WriteVLDUX8", [SiFiveP400VLD], mx, IsWorstCase>;
390- defm "" : LMULWriteResMX<"WriteVLDUX16", [SiFiveP400VLD], mx, IsWorstCase>;
391- defm "" : LMULWriteResMX<"WriteVLDUX32", [SiFiveP400VLD], mx, IsWorstCase>;
392- defm "" : LMULWriteResMX<"WriteVLDUX64", [SiFiveP400VLD], mx, IsWorstCase>;
393- defm "" : LMULWriteResMX<"WriteVLDOX8", [SiFiveP400VLD], mx, IsWorstCase>;
394- defm "" : LMULWriteResMX<"WriteVLDOX16", [SiFiveP400VLD], mx, IsWorstCase>;
395- defm "" : LMULWriteResMX<"WriteVLDOX32", [SiFiveP400VLD], mx, IsWorstCase>;
396- defm "" : LMULWriteResMX<"WriteVLDOX64", [SiFiveP400VLD], mx, IsWorstCase>;
397- }
398- }
399385
386+ // Note that the latency of vector loads is measured by consuming the loaded
387+ // value with vmv.x.s and then subtracting the latency of vmv.x.s from the result.
400388foreach mx = SchedMxList in {
401389 defvar LMulLat = SiFiveP400GetLMulCycles<mx>.c;
402390 defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
403- let Latency = 8, ReleaseAtCycles = [LMulLat] in {
404- defm "" : LMULWriteResMX<"WriteVSTE", [SiFiveP400VST], mx, IsWorstCase>;
405- defm "" : LMULWriteResMX<"WriteVSTM", [SiFiveP400VST], mx, IsWorstCase>;
406- }
407- let Latency = 12, ReleaseAtCycles = [LMulLat] in {
408- defm "" : LMULWriteResMX<"WriteVSTS8", [SiFiveP400VST], mx, IsWorstCase>;
409- defm "" : LMULWriteResMX<"WriteVSTS16", [SiFiveP400VST], mx, IsWorstCase>;
410- defm "" : LMULWriteResMX<"WriteVSTS32", [SiFiveP400VST], mx, IsWorstCase>;
411- defm "" : LMULWriteResMX<"WriteVSTS64", [SiFiveP400VST], mx, IsWorstCase>;
391+ let Latency = 8 in {
392+ let ReleaseAtCycles = [LMulLat] in {
393+ defm "" : LMULWriteResMX<"WriteVLDE", [SiFiveP400VLD], mx, IsWorstCase>;
394+ defm "" : LMULWriteResMX<"WriteVLDFF", [SiFiveP400VLD], mx, IsWorstCase>;
395+
396+ defm "" : LMULWriteResMX<"WriteVSTE", [SiFiveP400VST], mx, IsWorstCase>;
397+ }
398+
399+ // Mask load and store have a maximum EMUL of 1.
400+ let ReleaseAtCycles = [SiFiveP400GetLMulCycles<"M1">.c] in {
401+ defm "" : LMULWriteResMX<"WriteVLDM", [SiFiveP400VLD], mx, IsWorstCase=!eq(mx, "M1")>;
402+ defm "" : LMULWriteResMX<"WriteVSTM", [SiFiveP400VST], mx, IsWorstCase=!eq(mx, "M1")>;
403+ }
412404 }
413- let Latency = 12, ReleaseAtCycles = [LMulLat] in {
414- defm "" : LMULWriteResMX<"WriteVSTUX8", [SiFiveP400VST], mx, IsWorstCase>;
415- defm "" : LMULWriteResMX<"WriteVSTUX16", [SiFiveP400VST], mx, IsWorstCase>;
416- defm "" : LMULWriteResMX<"WriteVSTUX32", [SiFiveP400VST], mx, IsWorstCase>;
417- defm "" : LMULWriteResMX<"WriteVSTUX64", [SiFiveP400VST], mx, IsWorstCase>;
418- defm "" : LMULWriteResMX<"WriteVSTOX8", [SiFiveP400VST], mx, IsWorstCase>;
419- defm "" : LMULWriteResMX<"WriteVSTOX16", [SiFiveP400VST], mx, IsWorstCase>;
420- defm "" : LMULWriteResMX<"WriteVSTOX32", [SiFiveP400VST], mx, IsWorstCase>;
421- defm "" : LMULWriteResMX<"WriteVSTOX64", [SiFiveP400VST], mx, IsWorstCase>;
405+ foreach eew = [8, 16, 32, 64] in {
406+ let Latency = SiFiveP400StridedLdStLatency<mx, eew>.val,
407+ ReleaseAtCycles = [SiFiveP400GetVLMAX<mx, eew>.val] in {
408+ defm "" : LMULWriteResMX<"WriteVLDS" # eew, [SiFiveP400VLD], mx, IsWorstCase>;
409+ defm "" : LMULWriteResMX<"WriteVLDUX" # eew, [SiFiveP400VLD], mx, IsWorstCase>;
410+ defm "" : LMULWriteResMX<"WriteVLDOX" # eew, [SiFiveP400VLD], mx, IsWorstCase>;
411+
412+ defm "" : LMULWriteResMX<"WriteVSTS" # eew, [SiFiveP400VST], mx, IsWorstCase>;
413+ defm "" : LMULWriteResMX<"WriteVSTUX" # eew, [SiFiveP400VST], mx, IsWorstCase>;
414+ defm "" : LMULWriteResMX<"WriteVSTOX" # eew, [SiFiveP400VST], mx, IsWorstCase>;
415+ }
422416 }
423417}
424418
425419foreach mx = SchedMxList in {
426420 foreach nf=2-8 in {
427421 foreach eew = [8, 16, 32, 64] in {
428422 defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
429- defvar LMulLat = SiFiveP400GetCyclesSegmented <mx, eew, nf>.c;
423+ defvar LMulLat = SiFiveP400SegmentedLdStCycles <mx, eew, nf>.c;
430424 let Latency = !add(12, LMulLat), ReleaseAtCycles = [!add(12, LMulLat)] in {
431425 defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" #eew, [SiFiveP400VLD], mx, IsWorstCase>;
432426 defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" #eew, [SiFiveP400VLD], mx, IsWorstCase>;
0 commit comments