@@ -22,6 +22,8 @@ class SiFiveP400IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit is
2222 bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
2323}
2424
// Vector register length (VLEN) assumed by this scheduling model. It is
// divided by SEW/EEW in SiFiveP400GetVLMAX to derive the element count.
25+ defvar SiFiveP400VLEN = 128;
26+
2527// 1 Micro-Op per cycle.
2628class SiFiveP400GetLMulCycles<string mx> {
2729 int c = !cond(
@@ -35,19 +37,19 @@ class SiFiveP400GetLMulCycles<string mx> {
3537 );
3638}
3739
38- // Latency for segmented loads and stores are calculated as vl * nf.
39- class SiFiveP400GetCyclesSegmented<string mx, int sew, int nf> {
40- defvar VLEN = 128;
41- defvar VLUpperBound = !cond(
42- !eq(mx, "M1") : !div(VLEN, sew),
43- !eq(mx, "M2") : !div(!mul(VLEN, 2), sew),
44- !eq(mx, "M4") : !div(!mul(VLEN, 4), sew),
45- !eq(mx, "M8") : !div(!mul(VLEN, 8), sew),
46- !eq(mx, "MF2") : !div(!div(VLEN, 2), sew),
47- !eq(mx, "MF4") : !div(!div(VLEN, 4), sew),
48- !eq(mx, "MF8") : !div(!div(VLEN, 8), sew),
// Computes VLMAX = (VLEN * LMUL) / SEW for the LMUL name `mx` and element
// width `sew`, exposing the result as `val`.
// Fractional LMULs (MF2/MF4/MF8) divide SiFiveP400VLEN explicitly; every other
// `mx` falls through to the `true` arm, which multiplies VLEN by
// SiFiveP400GetLMulCycles<mx>.c.
// NOTE(review): the fall-through arm relies on SiFiveP400GetLMulCycles<mx>.c
// being equal to the LMUL multiplier for M1/M2/M4/M8 -- confirm against that
// class's !cond table.
40+ class SiFiveP400GetVLMAX<string mx, int sew> {
41+ defvar LMUL = SiFiveP400GetLMulCycles<mx>.c;
42+ int val = !cond(
43+ !eq(mx, "MF2") : !div(!div(SiFiveP400VLEN, 2), sew),
44+ !eq(mx, "MF4") : !div(!div(SiFiveP400VLEN, 4), sew),
45+ !eq(mx, "MF8") : !div(!div(SiFiveP400VLEN, 8), sew),
46+ true: !div(!mul(SiFiveP400VLEN, LMUL), sew)
4947 );
50- int c = !mul(VLUpperBound, nf);
48+ }
49+
50+ // Latency for segmented loads and stores is calculated as vl * nf, taking vl to be VLMAX.
51+ class SiFiveP400SegmentedLdStCycles<string mx, int sew, int nf> {
52+ int c = !mul(SiFiveP400GetVLMAX<mx, sew>.val, nf);
5153}
5254
5355// Both variants of floating point vector reductions are based on numbers collected
@@ -368,65 +370,44 @@ def : WriteRes<WriteVSETIVLI, [SiFiveP400SYS]>;
368370def : WriteRes<WriteVSETVL, [SiFiveP400SYS]>;
369371
370372// 7. Vector Loads and Stores
371- // FIXME: This unit is still being improved, currently
372- // it is based on stage numbers. Estimates are optimistic,
373- // latency may be longer.
374- foreach mx = SchedMxList in {
375- defvar LMulLat = SiFiveP400GetLMulCycles<mx>.c;
376- defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
377- let Latency = 8, ReleaseAtCycles = [LMulLat] in {
378- defm "" : LMULWriteResMX<"WriteVLDE", [SiFiveP400VLD], mx, IsWorstCase>;
379- defm "" : LMULWriteResMX<"WriteVLDM", [SiFiveP400VLD], mx, IsWorstCase>;
380- defm "" : LMULWriteResMX<"WriteVLDFF", [SiFiveP400VLD], mx, IsWorstCase>;
381- }
382- let Latency = 12, ReleaseAtCycles = [LMulLat] in {
383- defm "" : LMULWriteResMX<"WriteVLDS8", [SiFiveP400VLD], mx, IsWorstCase>;
384- defm "" : LMULWriteResMX<"WriteVLDS16", [SiFiveP400VLD], mx, IsWorstCase>;
385- defm "" : LMULWriteResMX<"WriteVLDS32", [SiFiveP400VLD], mx, IsWorstCase>;
386- defm "" : LMULWriteResMX<"WriteVLDS64", [SiFiveP400VLD], mx, IsWorstCase>;
387- }
388- let Latency = 12, ReleaseAtCycles = [LMulLat] in {
389- defm "" : LMULWriteResMX<"WriteVLDUX8", [SiFiveP400VLD], mx, IsWorstCase>;
390- defm "" : LMULWriteResMX<"WriteVLDUX16", [SiFiveP400VLD], mx, IsWorstCase>;
391- defm "" : LMULWriteResMX<"WriteVLDUX32", [SiFiveP400VLD], mx, IsWorstCase>;
392- defm "" : LMULWriteResMX<"WriteVLDUX64", [SiFiveP400VLD], mx, IsWorstCase>;
393- defm "" : LMULWriteResMX<"WriteVLDOX8", [SiFiveP400VLD], mx, IsWorstCase>;
394- defm "" : LMULWriteResMX<"WriteVLDOX16", [SiFiveP400VLD], mx, IsWorstCase>;
395- defm "" : LMULWriteResMX<"WriteVLDOX32", [SiFiveP400VLD], mx, IsWorstCase>;
396- defm "" : LMULWriteResMX<"WriteVLDOX64", [SiFiveP400VLD], mx, IsWorstCase>;
397- }
398- }
399373
374+ // Note that the latency of vector loads is measured by consuming the loaded
375+ // value with vmv.x.s and then subtracting the latency of vmv.x.s from the result.
400376foreach mx = SchedMxList in {
401377 defvar LMulLat = SiFiveP400GetLMulCycles<mx>.c;
402378 defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
403- let Latency = 8, ReleaseAtCycles = [LMulLat] in {
404- defm "" : LMULWriteResMX<"WriteVSTE", [SiFiveP400VST], mx, IsWorstCase>;
405- defm "" : LMULWriteResMX<"WriteVSTM", [SiFiveP400VST], mx, IsWorstCase>;
406- }
407- let Latency = 12, ReleaseAtCycles = [LMulLat] in {
408- defm "" : LMULWriteResMX<"WriteVSTS8", [SiFiveP400VST], mx, IsWorstCase>;
409- defm "" : LMULWriteResMX<"WriteVSTS16", [SiFiveP400VST], mx, IsWorstCase>;
410- defm "" : LMULWriteResMX<"WriteVSTS32", [SiFiveP400VST], mx, IsWorstCase>;
411- defm "" : LMULWriteResMX<"WriteVSTS64", [SiFiveP400VST], mx, IsWorstCase>;
// WriteVLDE/WriteVLDFF/WriteVSTE and the mask variants WriteVLDM/WriteVSTM all
// use Latency = 8; the two inner `let` groups differ only in ReleaseAtCycles
// (per-LMUL cycles vs. the fixed M1 cycle count) and in how IsWorstCase is set.
379+ let Latency = 8 in {
380+ let ReleaseAtCycles = [LMulLat] in {
381+ defm "" : LMULWriteResMX<"WriteVLDE", [SiFiveP400VLD], mx, IsWorstCase>;
382+ defm "" : LMULWriteResMX<"WriteVLDFF", [SiFiveP400VLD], mx, IsWorstCase>;
383+
384+ defm "" : LMULWriteResMX<"WriteVSTE", [SiFiveP400VST], mx, IsWorstCase>;
385+ }
386+
387+ // Mask load and store always have EMUL=1.
388+ let ReleaseAtCycles = [SiFiveP400GetLMulCycles<"M1">.c] in {
389+ defm "" : LMULWriteResMX<"WriteVLDM", [SiFiveP400VLD], mx, IsWorstCase=!eq(mx, "M1")>;
390+ defm "" : LMULWriteResMX<"WriteVSTM", [SiFiveP400VST], mx, IsWorstCase=!eq(mx, "M1")>;
391+ }
412392 }
413- let Latency = 12, ReleaseAtCycles = [LMulLat] in {
414- defm "" : LMULWriteResMX<"WriteVSTUX8", [SiFiveP400VST], mx, IsWorstCase>;
415- defm "" : LMULWriteResMX<"WriteVSTUX16", [SiFiveP400VST], mx, IsWorstCase>;
416- defm "" : LMULWriteResMX<"WriteVSTUX32", [SiFiveP400VST], mx, IsWorstCase>;
417- defm "" : LMULWriteResMX<"WriteVSTUX64", [SiFiveP400VST], mx, IsWorstCase>;
418- defm "" : LMULWriteResMX<"WriteVSTOX8", [SiFiveP400VST], mx, IsWorstCase>;
419- defm "" : LMULWriteResMX<"WriteVSTOX16", [SiFiveP400VST], mx, IsWorstCase>;
420- defm "" : LMULWriteResMX<"WriteVSTOX32", [SiFiveP400VST], mx, IsWorstCase>;
421- defm "" : LMULWriteResMX<"WriteVSTOX64", [SiFiveP400VST], mx, IsWorstCase>;
// Per-EEW WriteVLDS/WriteVLDUX/WriteVLDOX and WriteVSTS/WriteVSTUX/WriteVSTOX
// entries: Latency = 13, with ReleaseAtCycles set to VLMAX for (mx, eew) as
// computed by SiFiveP400GetVLMAX.
393+ foreach eew = [8, 16, 32, 64] in {
394+ let Latency = 13, ReleaseAtCycles = [SiFiveP400GetVLMAX<mx, eew>.val] in {
395+ defm "" : LMULWriteResMX<"WriteVLDS" # eew, [SiFiveP400VLD], mx, IsWorstCase>;
396+ defm "" : LMULWriteResMX<"WriteVLDUX" # eew, [SiFiveP400VLD], mx, IsWorstCase>;
397+ defm "" : LMULWriteResMX<"WriteVLDOX" # eew, [SiFiveP400VLD], mx, IsWorstCase>;
398+
399+ defm "" : LMULWriteResMX<"WriteVSTS" # eew, [SiFiveP400VST], mx, IsWorstCase>;
400+ defm "" : LMULWriteResMX<"WriteVSTUX" # eew, [SiFiveP400VST], mx, IsWorstCase>;
401+ defm "" : LMULWriteResMX<"WriteVSTOX" # eew, [SiFiveP400VST], mx, IsWorstCase>;
402+ }
422403 }
423404}
424405
425406foreach mx = SchedMxList in {
426407 foreach nf=2-8 in {
427408 foreach eew = [8, 16, 32, 64] in {
428409 defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
429- defvar LMulLat = SiFiveP400GetCyclesSegmented <mx, eew, nf>.c;
410+ defvar LMulLat = SiFiveP400SegmentedLdStCycles <mx, eew, nf>.c;
430411 let Latency = !add(12, LMulLat), ReleaseAtCycles = [!add(12, LMulLat)] in {
431412 defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" #eew, [SiFiveP400VLD], mx, IsWorstCase>;
432413 defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" #eew, [SiFiveP400VLD], mx, IsWorstCase>;
0 commit comments