Skip to content

Commit 43a5cbb

Browse files
committed
[RISCV] Update some of the RVV memory ops in P400 & P600 sched models
1 parent 44badc9 commit 43a5cbb

File tree

8 files changed

+2981
-118
lines changed

8 files changed

+2981
-118
lines changed

llvm/lib/Target/RISCV/RISCVSchedSiFiveP400.td

Lines changed: 40 additions & 59 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,8 @@ class SiFiveP400IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit is
2222
bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
2323
}
2424

25+
defvar SiFiveP400VLEN = 128;
26+
2527
// 1 Micro-Op per cycle.
2628
class SiFiveP400GetLMulCycles<string mx> {
2729
int c = !cond(
@@ -35,19 +37,19 @@ class SiFiveP400GetLMulCycles<string mx> {
3537
);
3638
}
3739

38-
// Latency for segmented loads and stores are calculated as vl * nf.
39-
class SiFiveP400GetCyclesSegmented<string mx, int sew, int nf> {
40-
defvar VLEN = 128;
41-
defvar VLUpperBound = !cond(
42-
!eq(mx, "M1") : !div(VLEN, sew),
43-
!eq(mx, "M2") : !div(!mul(VLEN, 2), sew),
44-
!eq(mx, "M4") : !div(!mul(VLEN, 4), sew),
45-
!eq(mx, "M8") : !div(!mul(VLEN, 8), sew),
46-
!eq(mx, "MF2") : !div(!div(VLEN, 2), sew),
47-
!eq(mx, "MF4") : !div(!div(VLEN, 4), sew),
48-
!eq(mx, "MF8") : !div(!div(VLEN, 8), sew),
40+
class SiFiveP400GetVLMAX<string mx, int sew> {
41+
defvar LMUL = SiFiveP400GetLMulCycles<mx>.c;
42+
int val = !cond(
43+
!eq(mx, "MF2") : !div(!div(SiFiveP400VLEN, 2), sew),
44+
!eq(mx, "MF4") : !div(!div(SiFiveP400VLEN, 4), sew),
45+
!eq(mx, "MF8") : !div(!div(SiFiveP400VLEN, 8), sew),
46+
true: !div(!mul(SiFiveP400VLEN, LMUL), sew)
4947
);
50-
int c = !mul(VLUpperBound, nf);
48+
}
49+
50+
// Latency for segmented loads and stores is calculated as VLMAX * nf.
51+
class SiFiveP400SegmentedLdStCycles<string mx, int sew, int nf> {
52+
int c = !mul(SiFiveP400GetVLMAX<mx, sew>.val, nf);
5153
}
5254

5355
// Both variants of floating point vector reductions are based on numbers collected
@@ -368,65 +370,44 @@ def : WriteRes<WriteVSETIVLI, [SiFiveP400SYS]>;
368370
def : WriteRes<WriteVSETVL, [SiFiveP400SYS]>;
369371

370372
// 7. Vector Loads and Stores
371-
// FIXME: This unit is still being improved, currently
372-
// it is based on stage numbers. Estimates are optimistic,
373-
// latency may be longer.
374-
foreach mx = SchedMxList in {
375-
defvar LMulLat = SiFiveP400GetLMulCycles<mx>.c;
376-
defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
377-
let Latency = 8, ReleaseAtCycles = [LMulLat] in {
378-
defm "" : LMULWriteResMX<"WriteVLDE", [SiFiveP400VLD], mx, IsWorstCase>;
379-
defm "" : LMULWriteResMX<"WriteVLDM", [SiFiveP400VLD], mx, IsWorstCase>;
380-
defm "" : LMULWriteResMX<"WriteVLDFF", [SiFiveP400VLD], mx, IsWorstCase>;
381-
}
382-
let Latency = 12, ReleaseAtCycles = [LMulLat] in {
383-
defm "" : LMULWriteResMX<"WriteVLDS8", [SiFiveP400VLD], mx, IsWorstCase>;
384-
defm "" : LMULWriteResMX<"WriteVLDS16", [SiFiveP400VLD], mx, IsWorstCase>;
385-
defm "" : LMULWriteResMX<"WriteVLDS32", [SiFiveP400VLD], mx, IsWorstCase>;
386-
defm "" : LMULWriteResMX<"WriteVLDS64", [SiFiveP400VLD], mx, IsWorstCase>;
387-
}
388-
let Latency = 12, ReleaseAtCycles = [LMulLat] in {
389-
defm "" : LMULWriteResMX<"WriteVLDUX8", [SiFiveP400VLD], mx, IsWorstCase>;
390-
defm "" : LMULWriteResMX<"WriteVLDUX16", [SiFiveP400VLD], mx, IsWorstCase>;
391-
defm "" : LMULWriteResMX<"WriteVLDUX32", [SiFiveP400VLD], mx, IsWorstCase>;
392-
defm "" : LMULWriteResMX<"WriteVLDUX64", [SiFiveP400VLD], mx, IsWorstCase>;
393-
defm "" : LMULWriteResMX<"WriteVLDOX8", [SiFiveP400VLD], mx, IsWorstCase>;
394-
defm "" : LMULWriteResMX<"WriteVLDOX16", [SiFiveP400VLD], mx, IsWorstCase>;
395-
defm "" : LMULWriteResMX<"WriteVLDOX32", [SiFiveP400VLD], mx, IsWorstCase>;
396-
defm "" : LMULWriteResMX<"WriteVLDOX64", [SiFiveP400VLD], mx, IsWorstCase>;
397-
}
398-
}
399373

374+
// Note that the latency of vector loads is measured by consuming the loaded
375+
// value with vmv.x.s before subtracting the latency of vmv.x.s from the number.
400376
foreach mx = SchedMxList in {
401377
defvar LMulLat = SiFiveP400GetLMulCycles<mx>.c;
402378
defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
403-
let Latency = 8, ReleaseAtCycles = [LMulLat] in {
404-
defm "" : LMULWriteResMX<"WriteVSTE", [SiFiveP400VST], mx, IsWorstCase>;
405-
defm "" : LMULWriteResMX<"WriteVSTM", [SiFiveP400VST], mx, IsWorstCase>;
406-
}
407-
let Latency = 12, ReleaseAtCycles = [LMulLat] in {
408-
defm "" : LMULWriteResMX<"WriteVSTS8", [SiFiveP400VST], mx, IsWorstCase>;
409-
defm "" : LMULWriteResMX<"WriteVSTS16", [SiFiveP400VST], mx, IsWorstCase>;
410-
defm "" : LMULWriteResMX<"WriteVSTS32", [SiFiveP400VST], mx, IsWorstCase>;
411-
defm "" : LMULWriteResMX<"WriteVSTS64", [SiFiveP400VST], mx, IsWorstCase>;
379+
let Latency = 8 in {
380+
let ReleaseAtCycles = [LMulLat] in {
381+
defm "" : LMULWriteResMX<"WriteVLDE", [SiFiveP400VLD], mx, IsWorstCase>;
382+
defm "" : LMULWriteResMX<"WriteVLDFF", [SiFiveP400VLD], mx, IsWorstCase>;
383+
384+
defm "" : LMULWriteResMX<"WriteVSTE", [SiFiveP400VST], mx, IsWorstCase>;
385+
}
386+
387+
// Mask loads and stores always have EMUL=1.
388+
let ReleaseAtCycles = [SiFiveP400GetLMulCycles<"M1">.c] in {
389+
defm "" : LMULWriteResMX<"WriteVLDM", [SiFiveP400VLD], mx, IsWorstCase=!eq(mx, "M1")>;
390+
defm "" : LMULWriteResMX<"WriteVSTM", [SiFiveP400VST], mx, IsWorstCase=!eq(mx, "M1")>;
391+
}
412392
}
413-
let Latency = 12, ReleaseAtCycles = [LMulLat] in {
414-
defm "" : LMULWriteResMX<"WriteVSTUX8", [SiFiveP400VST], mx, IsWorstCase>;
415-
defm "" : LMULWriteResMX<"WriteVSTUX16", [SiFiveP400VST], mx, IsWorstCase>;
416-
defm "" : LMULWriteResMX<"WriteVSTUX32", [SiFiveP400VST], mx, IsWorstCase>;
417-
defm "" : LMULWriteResMX<"WriteVSTUX64", [SiFiveP400VST], mx, IsWorstCase>;
418-
defm "" : LMULWriteResMX<"WriteVSTOX8", [SiFiveP400VST], mx, IsWorstCase>;
419-
defm "" : LMULWriteResMX<"WriteVSTOX16", [SiFiveP400VST], mx, IsWorstCase>;
420-
defm "" : LMULWriteResMX<"WriteVSTOX32", [SiFiveP400VST], mx, IsWorstCase>;
421-
defm "" : LMULWriteResMX<"WriteVSTOX64", [SiFiveP400VST], mx, IsWorstCase>;
393+
foreach eew = [8, 16, 32, 64] in {
394+
let Latency = 13, ReleaseAtCycles = [SiFiveP400GetVLMAX<mx, eew>.val] in {
395+
defm "" : LMULWriteResMX<"WriteVLDS" # eew, [SiFiveP400VLD], mx, IsWorstCase>;
396+
defm "" : LMULWriteResMX<"WriteVLDUX" # eew, [SiFiveP400VLD], mx, IsWorstCase>;
397+
defm "" : LMULWriteResMX<"WriteVLDOX" # eew, [SiFiveP400VLD], mx, IsWorstCase>;
398+
399+
defm "" : LMULWriteResMX<"WriteVSTS" # eew, [SiFiveP400VST], mx, IsWorstCase>;
400+
defm "" : LMULWriteResMX<"WriteVSTUX" # eew, [SiFiveP400VST], mx, IsWorstCase>;
401+
defm "" : LMULWriteResMX<"WriteVSTOX" # eew, [SiFiveP400VST], mx, IsWorstCase>;
402+
}
422403
}
423404
}
424405

425406
foreach mx = SchedMxList in {
426407
foreach nf=2-8 in {
427408
foreach eew = [8, 16, 32, 64] in {
428409
defvar IsWorstCase = SiFiveP400IsWorstCaseMX<mx, SchedMxList>.c;
429-
defvar LMulLat = SiFiveP400GetCyclesSegmented<mx, eew, nf>.c;
410+
defvar LMulLat = SiFiveP400SegmentedLdStCycles<mx, eew, nf>.c;
430411
let Latency = !add(12, LMulLat), ReleaseAtCycles = [!add(12, LMulLat)] in {
431412
defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" #eew, [SiFiveP400VLD], mx, IsWorstCase>;
432413
defm "" : LMULWriteResMX<"WriteVLSEGFF" # nf # "e" #eew, [SiFiveP400VLD], mx, IsWorstCase>;

llvm/lib/Target/RISCV/RISCVSchedSiFiveP600.td

Lines changed: 40 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ class SiFiveP600IsWorstCaseMXSEW<string mx, int sew, list<string> MxList, bit is
2222
bit c = !and(!eq(mx, LLMUL), !eq(sew, SSEW));
2323
}
2424

25+
defvar SiFiveP600VLEN = 128;
26+
2527
// 1 Micro-Op per cycle.
2628
class SiFiveP600GetLMulCycles<string mx> {
2729
int c = !cond(
@@ -35,19 +37,19 @@ class SiFiveP600GetLMulCycles<string mx> {
3537
);
3638
}
3739

38-
// Latency for segmented loads and stores are calculated as vl * nf.
39-
class SiFiveP600GetCyclesSegmented<string mx, int sew, int nf> {
40-
defvar VLEN = 128;
41-
defvar VLUpperBound = !cond(
42-
!eq(mx, "M1") : !div(VLEN, sew),
43-
!eq(mx, "M2") : !div(!mul(VLEN, 2), sew),
44-
!eq(mx, "M4") : !div(!mul(VLEN, 4), sew),
45-
!eq(mx, "M8") : !div(!mul(VLEN, 8), sew),
46-
!eq(mx, "MF2") : !div(!div(VLEN, 2), sew),
47-
!eq(mx, "MF4") : !div(!div(VLEN, 4), sew),
48-
!eq(mx, "MF8") : !div(!div(VLEN, 8), sew),
40+
class SiFiveP600GetVLMAX<string mx, int sew> {
41+
defvar LMUL = SiFiveP600GetLMulCycles<mx>.c;
42+
int val = !cond(
43+
!eq(mx, "MF2") : !div(!div(SiFiveP600VLEN, 2), sew),
44+
!eq(mx, "MF4") : !div(!div(SiFiveP600VLEN, 4), sew),
45+
!eq(mx, "MF8") : !div(!div(SiFiveP600VLEN, 8), sew),
46+
true: !div(!mul(SiFiveP600VLEN, LMUL), sew)
4947
);
50-
int c = !mul(VLUpperBound, nf);
48+
}
49+
50+
// Latency for segmented loads and stores is calculated as VLMAX * nf.
51+
class SiFiveP600SegmentedLdStCycles<string mx, int sew, int nf> {
52+
int c = !mul(SiFiveP600GetVLMAX<mx, sew>.val, nf);
5153
}
5254

5355
class SiFiveP600VSM3CCycles<string mx> {
@@ -544,64 +546,43 @@ def : WriteRes<WriteVSETIVLI, [SiFiveP600SYS]>;
544546
def : WriteRes<WriteVSETVL, [SiFiveP600SYS]>;
545547

546548
// 7. Vector Loads and Stores
547-
// FIXME: This unit is still being improved, currently
548-
// it is based on stage numbers. Estimates are optimistic,
549-
// latency may be longer.
550-
foreach mx = SchedMxList in {
551-
defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
552-
defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
553-
let Latency = 8, ReleaseAtCycles = [LMulLat] in {
554-
defm "" : LMULWriteResMX<"WriteVLDE", [SiFiveP600VLD], mx, IsWorstCase>;
555-
defm "" : LMULWriteResMX<"WriteVLDM", [SiFiveP600VLD], mx, IsWorstCase>;
556-
defm "" : LMULWriteResMX<"WriteVLDFF", [SiFiveP600VLD], mx, IsWorstCase>;
557-
}
558-
let Latency = 12, ReleaseAtCycles = [LMulLat] in {
559-
defm "" : LMULWriteResMX<"WriteVLDS8", [SiFiveP600VLD], mx, IsWorstCase>;
560-
defm "" : LMULWriteResMX<"WriteVLDS16", [SiFiveP600VLD], mx, IsWorstCase>;
561-
defm "" : LMULWriteResMX<"WriteVLDS32", [SiFiveP600VLD], mx, IsWorstCase>;
562-
defm "" : LMULWriteResMX<"WriteVLDS64", [SiFiveP600VLD], mx, IsWorstCase>;
563-
}
564-
let Latency = 12, ReleaseAtCycles = [LMulLat] in {
565-
defm "" : LMULWriteResMX<"WriteVLDUX8", [SiFiveP600VLD], mx, IsWorstCase>;
566-
defm "" : LMULWriteResMX<"WriteVLDUX16", [SiFiveP600VLD], mx, IsWorstCase>;
567-
defm "" : LMULWriteResMX<"WriteVLDUX32", [SiFiveP600VLD], mx, IsWorstCase>;
568-
defm "" : LMULWriteResMX<"WriteVLDUX64", [SiFiveP600VLD], mx, IsWorstCase>;
569-
defm "" : LMULWriteResMX<"WriteVLDOX8", [SiFiveP600VLD], mx, IsWorstCase>;
570-
defm "" : LMULWriteResMX<"WriteVLDOX16", [SiFiveP600VLD], mx, IsWorstCase>;
571-
defm "" : LMULWriteResMX<"WriteVLDOX32", [SiFiveP600VLD], mx, IsWorstCase>;
572-
defm "" : LMULWriteResMX<"WriteVLDOX64", [SiFiveP600VLD], mx, IsWorstCase>;
573-
}
574-
}
575549

550+
// Note that the latency of vector loads is measured by consuming the loaded
551+
// value with vmv.x.s before subtracting the latency of vmv.x.s from the number.
576552
foreach mx = SchedMxList in {
577553
defvar LMulLat = SiFiveP600GetLMulCycles<mx>.c;
578554
defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
579-
let Latency = 8, ReleaseAtCycles = [LMulLat] in {
580-
defm "" : LMULWriteResMX<"WriteVSTE", [SiFiveP600VST], mx, IsWorstCase>;
581-
defm "" : LMULWriteResMX<"WriteVSTM", [SiFiveP600VST], mx, IsWorstCase>;
582-
}
583-
let Latency = 12, ReleaseAtCycles = [LMulLat] in {
584-
defm "" : LMULWriteResMX<"WriteVSTS8", [SiFiveP600VST], mx, IsWorstCase>;
585-
defm "" : LMULWriteResMX<"WriteVSTS16", [SiFiveP600VST], mx, IsWorstCase>;
586-
defm "" : LMULWriteResMX<"WriteVSTS32", [SiFiveP600VST], mx, IsWorstCase>;
587-
defm "" : LMULWriteResMX<"WriteVSTS64", [SiFiveP600VST], mx, IsWorstCase>;
555+
let Latency = 8 in {
556+
let ReleaseAtCycles = [LMulLat] in {
557+
defm "" : LMULWriteResMX<"WriteVLDE", [SiFiveP600VLD], mx, IsWorstCase>;
558+
defm "" : LMULWriteResMX<"WriteVLDFF", [SiFiveP600VLD], mx, IsWorstCase>;
559+
560+
defm "" : LMULWriteResMX<"WriteVSTE", [SiFiveP600VST], mx, IsWorstCase>;
561+
}
562+
563+
// Mask loads and stores always have EMUL=1.
564+
let ReleaseAtCycles = [SiFiveP600GetLMulCycles<"M1">.c] in {
565+
defm "" : LMULWriteResMX<"WriteVLDM", [SiFiveP600VLD], mx, IsWorstCase=!eq(mx,"M1")>;
566+
defm "" : LMULWriteResMX<"WriteVSTM", [SiFiveP600VST], mx, IsWorstCase=!eq(mx,"M1")>;
567+
}
588568
}
589-
let Latency = 12, ReleaseAtCycles = [LMulLat] in {
590-
defm "" : LMULWriteResMX<"WriteVSTUX8", [SiFiveP600VST], mx, IsWorstCase>;
591-
defm "" : LMULWriteResMX<"WriteVSTUX16", [SiFiveP600VST], mx, IsWorstCase>;
592-
defm "" : LMULWriteResMX<"WriteVSTUX32", [SiFiveP600VST], mx, IsWorstCase>;
593-
defm "" : LMULWriteResMX<"WriteVSTUX64", [SiFiveP600VST], mx, IsWorstCase>;
594-
defm "" : LMULWriteResMX<"WriteVSTOX8", [SiFiveP600VST], mx, IsWorstCase>;
595-
defm "" : LMULWriteResMX<"WriteVSTOX16", [SiFiveP600VST], mx, IsWorstCase>;
596-
defm "" : LMULWriteResMX<"WriteVSTOX32", [SiFiveP600VST], mx, IsWorstCase>;
597-
defm "" : LMULWriteResMX<"WriteVSTOX64", [SiFiveP600VST], mx, IsWorstCase>;
569+
foreach eew = [8, 16, 32, 64] in {
570+
let Latency = 13, ReleaseAtCycles = [SiFiveP600GetVLMAX<mx, eew>.val] in {
571+
defm "" : LMULWriteResMX<"WriteVLDS" # eew, [SiFiveP600VLD], mx, IsWorstCase>;
572+
defm "" : LMULWriteResMX<"WriteVLDUX" # eew, [SiFiveP600VLD], mx, IsWorstCase>;
573+
defm "" : LMULWriteResMX<"WriteVLDOX" # eew, [SiFiveP600VLD], mx, IsWorstCase>;
574+
575+
defm "" : LMULWriteResMX<"WriteVSTS" # eew, [SiFiveP600VST], mx, IsWorstCase>;
576+
defm "" : LMULWriteResMX<"WriteVSTUX" # eew, [SiFiveP600VST], mx, IsWorstCase>;
577+
defm "" : LMULWriteResMX<"WriteVSTOX" # eew, [SiFiveP600VST], mx, IsWorstCase>;
578+
}
598579
}
599580
}
600581

601582
foreach mx = SchedMxList in {
602583
foreach nf=2-8 in {
603584
foreach eew = [8, 16, 32, 64] in {
604-
defvar LMulLat = SiFiveP600GetCyclesSegmented<mx, eew, nf>.c;
585+
defvar LMulLat = SiFiveP600SegmentedLdStCycles<mx, eew, nf>.c;
605586
defvar IsWorstCase = SiFiveP600IsWorstCaseMX<mx, SchedMxList>.c;
606587
let Latency = !add(12, LMulLat), ReleaseAtCycles = [!add(12, LMulLat)] in {
607588
defm "" : LMULWriteResMX<"WriteVLSEG" # nf # "e" # eew, [SiFiveP600VLD], mx, IsWorstCase>;

0 commit comments

Comments
 (0)