-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[RISCV] tt-ascalon-d8 vector scheduling #167066
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Conversation
Drive-by: additional tuning knobs. Partial scheduling model for vector operations.
|
@llvm/pr-subscribers-backend-risc-v Author: Petr Penzin (ppenzin) Changes: Drive-by: additional tuning knobs. Partial scheduling model for vector operations. Full diff: https://github.com/llvm/llvm-project/pull/167066.diff 2 Files Affected:
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index e86431f78f1ba..07f6a38c77897 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -633,6 +633,13 @@ def TENSTORRENT_ASCALON_D8 : RISCVProcessorModel<"tt-ascalon-d8",
FeatureUnalignedVectorMem]),
[TuneNoDefaultUnroll,
TuneNLogNVRGather,
+ TuneOptimizedNF2SegmentLoadStore,
+ TuneOptimizedNF3SegmentLoadStore,
+ TuneOptimizedNF4SegmentLoadStore,
+ TuneOptimizedNF5SegmentLoadStore,
+ TuneOptimizedNF6SegmentLoadStore,
+ TuneOptimizedNF7SegmentLoadStore,
+ TuneOptimizedNF8SegmentLoadStore,
TuneOptimizedZeroStrideLoad,
TunePostRAScheduler]>;
diff --git a/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td b/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td
index da89e158f9839..973e55b5c53f8 100644
--- a/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td
+++ b/llvm/lib/Target/RISCV/RISCVSchedTTAscalonD8.td
@@ -8,19 +8,85 @@
//===----------------------------------------------------------------------===//
+/// Sets `c` when `mx` equals the largest LMUL of `MxList` (as reported by
+/// LargestLMUL). The result is passed as the IsWorstCase argument below.
+class AscalonIsWorstCaseMX<string mx, list<string> MxList> {
+  defvar WorstMX = LargestLMUL<MxList>.r;
+  bit c = !eq(WorstMX, mx);
+}
+
+/// Sets `c` when `mx` is the largest LMUL of `MxList` and `sew` is the
+/// smallest SEW reported by SmallestSEW for that LMUL (`isF` is forwarded
+/// to SmallestSEW unchanged).
+class AscalonIsWorstCaseMXSEW<string mx, int sew, list<string> MxList,
+                              bit isF = 0> {
+  defvar WorstMX = LargestLMUL<MxList>.r;
+  defvar WorstSEW = SmallestSEW<mx, isF>.r;
+  defvar IsWorstMX = !eq(mx, WorstMX);
+  defvar IsWorstSEW = !eq(sew, WorstSEW);
+  bit c = !and(IsWorstMX, IsWorstSEW);
+}
+
+/// Cycle count that grows linearly with integral LMUL. Every fractional LMUL
+/// costs the same as LMUL=1, i.e. `base`.
+class AscalonGetCyclesLMUL<string mx, int base> {
+  int c = !cond(
+    // Fractional LMULs and M1 all cost `base`.
+    !eq(mx, "MF8") : base,
+    !eq(mx, "MF4") : base,
+    !eq(mx, "MF2") : base,
+    !eq(mx, "M1") : base,
+    // Integral LMULs above 1 multiply `base` by the LMUL.
+    !eq(mx, "M2") : !mul(base, 2),
+    !eq(mx, "M4") : !mul(base, 4),
+    !eq(mx, "M8") : !mul(base, 8)
+  );
+}
+
+/// Cycle count that doubles with every LMUL step, with MF8 costing `base`
+/// and M8 costing 64 * `base`.
+class AscalonGetCyclesLMULFractional<string mx, int base> {
+  int c = !cond(
+    !eq(mx, "M8") : !mul(base, 64),
+    !eq(mx, "M4") : !mul(base, 32),
+    !eq(mx, "M2") : !mul(base, 16),
+    !eq(mx, "M1") : !mul(base, 8),
+    !eq(mx, "MF2") : !mul(base, 4),
+    !eq(mx, "MF4") : !mul(base, 2),
+    !eq(mx, "MF8") : base
+  );
+}
+
+/// Default cycle count: AscalonGetCyclesLMUL with a base of one cycle.
+class AscalonGetCyclesDefault<string mx> {
+  defvar UnitBase = 1;
+  int c = AscalonGetCyclesLMUL<mx, UnitBase>.c;
+}
+
+/// Cycle count table for narrowing operations. There is no M8 entry; the
+/// users in this file iterate LMUL lists named SchedMxListW / SchedMxListFW.
+class AscalonGetCyclesNarrowing<string mx> {
+  int c = !cond(
+    !eq(mx, "MF8") : 1,
+    !eq(mx, "MF4") : 1,
+    !eq(mx, "MF2") : 2,
+    !eq(mx, "M1") : 4,
+    !eq(mx, "M2") : 8,
+    !eq(mx, "M4") : 16
+  );
+}
+
+
+/// Cycle count for divide / square root: a per-SEW base value scaled by LMUL
+/// through AscalonGetCyclesLMUL.
+class AscalonGetCyclesDivOrSqrt<string mx, int sew> {
+  // Pick the per-SEW base first, then apply the LMUL scaling once.
+  defvar Base = !cond(
+    !eq(sew, 8) : 7, // TODO not valid for fp
+    !eq(sew, 16) : 6,
+    !eq(sew, 32) : 5,
+    !eq(sew, 64) : 8
+  );
+  int c = AscalonGetCyclesLMUL<mx, Base>.c;
+}
+
+//===----------------------------------------------------------------------===//
+
def TTAscalonD8Model : SchedMachineModel {
let IssueWidth = 8; // 8-way decode and dispatch
let MicroOpBufferSize = 256; // 256 micro-op re-order buffer
let LoadLatency = 4; // Optimistic load latency
let MispredictPenalty = 14; // Fetch + Decode/Rename/Dispatch + Branch
- let CompleteModel = 0;
+ let CompleteModel = false;
// TODO: supported, but haven't added scheduling info yet.
+ // HasVInstructions and HasVInstructionsI64 are dropped from this list because
+ // this patch adds vector scheduling info further down.
let UnsupportedFeatures = [HasStdExtZbkb, HasStdExtZbkc, HasStdExtZbkx,
HasStdExtZcmt, HasStdExtZknd, HasStdExtZkne,
HasStdExtZknh, HasStdExtZksed, HasStdExtZksh,
- HasStdExtZkr, HasVInstructions, HasVInstructionsI64];
+ HasStdExtZkr];
}
let SchedModel = TTAscalonD8Model in {
@@ -34,11 +100,17 @@ let BufferSize = 16 in {
def AscalonFXB : ProcResource<1>; // ALU, INT -> FP/VEC
def AscalonFXC : ProcResource<2>; // ALU, BR
def AscalonFXD : ProcResource<2>; // ALU
- def AscalonFP : ProcResource<2>;
- // TODO: two vector units with vector scheduling model.
+ def AscalonFX : ProcResGroup<[AscalonFXA, AscalonFXB, AscalonFXC, AscalonFXD]>;
+ // FP: two pipes, grouped as AscalonFP.
+ def AscalonFPA : ProcResource<1>; // Pipe A also handles FP/VEC -> INT
+ def AscalonFPB : ProcResource<1>;
+ def AscalonFP : ProcResGroup<[AscalonFPA, AscalonFPB]>;
+ // Vector: two pipes, grouped as AscalonV.
+ def AscalonVA : ProcResource<1>;
+ def AscalonVB : ProcResource<1>;
+ // The vector group must be built from the vector pipes. Listing the FP pipes
+ // here would make AscalonV a duplicate of AscalonFP and leave VA/VB unused.
+ def AscalonV : ProcResGroup<[AscalonVA, AscalonVB]>;
}
-def AscalonFX : ProcResGroup<[AscalonFXA, AscalonFXB, AscalonFXC, AscalonFXD]>;
//===----------------------------------------------------------------------===//
@@ -316,10 +388,244 @@ def : ReadAdvance<ReadSHXADD32, 0>;
def : ReadAdvance<ReadSingleBit, 0>;
def : ReadAdvance<ReadSingleBitImm, 0>;
+//===----------------------------------------------------------------------===//
+// Vector
+
+// Configuration-Setting Instructions
+// WriteRes already defaults to Latency = 1, so only WriteVSETVL needs an
+// explicit latency override.
+def : WriteRes<WriteVSETVLI, [AscalonV]>;
+def : WriteRes<WriteVSETIVLI, [AscalonV]>;
+let Latency = 2 in
+def : WriteRes<WriteVSETVL, [AscalonV]>;
+
+// Vector Integer Arithmetic Instructions
+// One set of WriteRes records per LMUL in SchedMxList. Latency comes from
+// AscalonGetCyclesDefault<mx>; the resource list is [AscalonFX, AscalonV] with
+// AcquireAtCycles = [0, 1] and ReleaseAtCycles = [1, Cycles].
+// NOTE(review): when Cycles == 1 the AscalonV entry has acquire == release,
+// i.e. an empty window -- confirm this is intended.
+foreach mx = SchedMxList in {
+ defvar Cycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULWriteResMX<"WriteVIALUV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIALUX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIALUI", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUI", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICALUMI", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVShiftI", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMinMaxV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMinMaxX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulAddV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMulAddX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMergeI", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIMovI", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVICmpI", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ }
+}
+// Integer divide: one record per (LMUL, SEW) pair, with the cycle count taken
+// from AscalonGetCyclesDivOrSqrt<mx, sew>.
+foreach mx = SchedMxList in {
+ foreach sew = SchedSEWSet<mx>.val in {
+ defvar Cycles = AscalonGetCyclesDivOrSqrt<mx, sew>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [AscalonFX, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIDivX", [AscalonFX, AscalonV], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// Widening
+// Integer widening ALU / multiply / multiply-add writes, one record per LMUL
+// in SchedMxListW, using the default per-LMUL cycle count.
+foreach mx = SchedMxListW in {
+ defvar Cycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxListW>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULWriteResMX<"WriteVIWALUV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWALUX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWALUI", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulAddV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVIWMulAddX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ }
+}
+// Narrowing
+// Integer narrowing shifts, one record per LMUL in SchedMxListW, with cycles
+// taken from the AscalonGetCyclesNarrowing table.
+foreach mx = SchedMxListW in {
+ defvar Cycles = AscalonGetCyclesNarrowing<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxListW>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULWriteResMX<"WriteVNShiftV", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNShiftX", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ defm "" : LMULWriteResMX<"WriteVNShiftI", [AscalonFX, AscalonV], mx, IsWorstCase>;
+ }
+}
+
+// Vector Floating-Point Instructions
+// One record per (LMUL, FP SEW) pair. All writes below share the same
+// latency and acquire/release settings, so a single `let` scope covers them.
+foreach mx = SchedMxListF in {
+  foreach sew = SchedSEWSet<mx, isF=1>.val in {
+    defvar FPCycles = AscalonGetCyclesDefault<mx>.c;
+    defvar Worst = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c;
+    let Latency = FPCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, FPCycles] in {
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [AscalonFP, AscalonV], mx, sew, Worst>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [AscalonFP, AscalonV], mx, sew, Worst>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [AscalonFP, AscalonV], mx, sew, Worst>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [AscalonFP, AscalonV], mx, sew, Worst>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [AscalonFP, AscalonV], mx, sew, Worst>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [AscalonFP, AscalonV], mx, sew, Worst>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [AscalonFP, AscalonV], mx, sew, Worst>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [AscalonFP, AscalonV], mx, sew, Worst>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [AscalonFP, AscalonV], mx, sew, Worst>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [AscalonFP, AscalonV], mx, sew, Worst>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [AscalonFP, AscalonV], mx, sew, Worst>;
+      defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [AscalonFP, AscalonV], mx, sew, Worst>;
+    }
+  }
+}
+// FP writes keyed by LMUL only (no per-SEW variants). They all share one
+// latency and acquire/release setting, so one `let` scope is enough.
+foreach mx = SchedMxList in {
+  defvar FPCycles = AscalonGetCyclesDefault<mx>.c;
+  defvar Worst = AscalonIsWorstCaseMX<mx, SchedMxList>.c;
+  let Latency = FPCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, FPCycles] in {
+    // FP -> int conversion names pipe A specifically.
+    defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [AscalonFPA, AscalonV], mx, Worst>;
+    // The remaining writes use the AscalonFP group.
+    defm "" : LMULWriteResMX<"WriteVFClassV", [AscalonFP, AscalonV], mx, Worst>;
+    defm "" : LMULWriteResMX<"WriteVFMergeV", [AscalonFP, AscalonV], mx, Worst>;
+    defm "" : LMULWriteResMX<"WriteVFMovV", [AscalonFP, AscalonV], mx, Worst>;
+    defm "" : LMULWriteResMX<"WriteVFCmpV", [AscalonFP, AscalonV], mx, Worst>;
+    defm "" : LMULWriteResMX<"WriteVFCmpF", [AscalonFP, AscalonV], mx, Worst>;
+  }
+}
+// FP square root and divide: one record per (LMUL, FP SEW) pair, with cycles
+// from AscalonGetCyclesDivOrSqrt<mx, sew>.
+foreach mx = SchedMxListF in {
+ foreach sew = SchedSEWSet<mx, isF=1>.val in {
+ defvar Cycles = AscalonGetCyclesDivOrSqrt<mx, sew>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// Widening
+// Widening int -> FP convert. Iterates the non-FP widening SEW set
+// (isF=0, isWidening=1) and names AscalonFXB as the first resource.
+foreach mx = SchedMxListW in {
+ foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in {
+ defvar Cycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListW>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [AscalonFXB, AscalonV], mx, sew, IsWorstCase>;
+ }
+}
+// FP widening writes. The inner loop emits per-(LMUL, SEW) records; the
+// trailing WriteVFWCvtFToIV record is per-LMUL only and names AscalonFPA.
+foreach mx = SchedMxListFW in {
+ foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
+ defvar Cycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [AscalonFP, AscalonV], mx, sew, IsWorstCase>;
+ }
+ }
+ // Per-LMUL values for the SEW-independent record below (the defvars above
+ // are scoped to the inner foreach).
+ defvar Cycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxListFW>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in
+ defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [AscalonFPA, AscalonV], mx, IsWorstCase>;
+}
+// Narrowing
+// Narrowing FP -> int convert, one record per LMUL in SchedMxListW, using the
+// AscalonGetCyclesNarrowing table and naming AscalonFPA.
+foreach mx = SchedMxListW in {
+ defvar Cycles = AscalonGetCyclesNarrowing<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxListW>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [AscalonFPA, AscalonV], mx, IsWorstCase>;
+ }
+}
+// Narrowing converts that produce FP results, one record per (LMUL, SEW).
+// NOTE(review): WriteVFNCvtFToFV is an FP -> FP convert but names the integer
+// pipe AscalonFXB like the int -> FP case -- confirm this is intended rather
+// than AscalonFP.
+foreach mx = SchedMxListFW in {
+ foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in {
+ defvar Cycles = AscalonGetCyclesNarrowing<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [AscalonFXB, AscalonV], mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [AscalonFXB, AscalonV], mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// Vector Reduction Instructions
+// Integer reductions: one record per (LMUL, SEW) pair, using the default
+// per-LMUL cycle count.
+foreach mx = SchedMxList in {
+ foreach sew = SchedSEWSet<mx>.val in {
+ defvar Cycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxList>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [AscalonFX, AscalonV],
+ mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIRedMinMaxV_From", [AscalonFX, AscalonV],
+ mx, sew, IsWorstCase>;
+ }
+ }
+}
+
+// Widening integer reductions over SchedMxListWRed. SchedSEWSet<mx, 0, 1>
+// passes its second and third arguments positionally (written elsewhere in
+// this file as isF and isWidening).
+foreach mx = SchedMxListWRed in {
+ foreach sew = SchedSEWSet<mx, 0, 1>.val in {
+ defvar Cycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListWRed>.c;
+ let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in
+ defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [AscalonFX, AscalonV],
+ mx, sew, IsWorstCase>;
+ }
+}
+
+// FP reductions. Unordered and min/max reductions use the default per-LMUL
+// cycle count; the ordered reduction uses AscalonGetCyclesLMUL with base 18.
+// NOTE(review): these FP writes name the integer group AscalonFX, same as the
+// integer reductions -- confirm AscalonFP was not intended.
+foreach mx = SchedMxListF in {
+ foreach sew = SchedSEWSet<mx, 1>.val in {
+ defvar RedCycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c;
+ let Latency = RedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, RedCycles] in {
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [AscalonFX, AscalonV],
+ mx, sew, IsWorstCase>;
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [AscalonFX, AscalonV],
+ mx, sew, IsWorstCase>;
+ }
+ defvar OrdRedCycles = AscalonGetCyclesLMUL<mx, 18>.c;
+ let Latency = OrdRedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, OrdRedCycles] in
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [AscalonFX, AscalonV],
+ mx, sew, IsWorstCase>;
+ }
+}
+
+// Widening FP reductions over SchedMxListFWRed. The unordered form uses the
+// default per-LMUL cycle count; the ordered form uses AscalonGetCyclesLMUL
+// with base 18.
+// NOTE(review): these FP writes name the integer group AscalonFX -- confirm
+// AscalonFP was not intended.
+foreach mx = SchedMxListFWRed in {
+ foreach sew = SchedSEWSet<mx, 1, 1>.val in {
+ defvar RedCycles = AscalonGetCyclesDefault<mx>.c;
+ defvar IsWorstCase = AscalonIsWorstCaseMXSEW<mx, sew, SchedMxListFWRed, 1>.c;
+ let Latency = RedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, RedCycles] in
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [AscalonFX, AscalonV],
+ mx, sew, IsWorstCase>;
+ defvar OrdRedCycles = AscalonGetCyclesLMUL<mx, 18>.c;
+ let Latency = OrdRedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, OrdRedCycles] in
+ defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [AscalonFX, AscalonV],
+ mx, sew, IsWorstCase>;
+ }
+}
+
//===----------------------------------------------------------------------===//
// Unsupported extensions
defm : UnsupportedSchedQ;
-defm : UnsupportedSchedV;
defm : UnsupportedSchedZabha;
defm : UnsupportedSchedZbc;
defm : UnsupportedSchedZbkb;
|
|
This is far from complete: in addition to more latencies, it also needs tests. I have some questions. Does max(ReleaseAtCycles) have to be less than Latency? Can AcquireAtCycles overlap for different pipelines? |
In most of the cases Latency should be no less than the largest occupancy / ReleaseAtCycles. But both MachineScheduler and MCA support cases where the occupancy is larger. Here is a more detailed explanation I made somewhere else: mshockwave/portfolio#13 (reply in thread)
I don't quite get the question, could you give an example? I just skimmed over your code and the AcquireAtCycles in it look pretty typical to me |
mshockwave
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
could you also add llvm-mca tests?
| foreach mx = SchedMxList in { | ||
| defvar Cycles = AscalonGetCyclesDefault<mx>.c; | ||
| defvar IsWorstCase = AscalonIsWorstCaseMX<mx, SchedMxList>.c; | ||
| let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles] in { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
just realized this is an out-of-order core: AcquireAtCycles is only meaningfully used by in-order scheduling in MachineScheduler. When doing in-order scheduling, MachineScheduler maintains a virtual timeline for each resource to keep track of their reservation / consumption -- e.g. resource X is reserved from cycle A to cycle B -- "A" and "B" in this case are AcquireAtCycle and ReleaseAtCycles, respectively. MachineScheduler doesn't do that for out-of-order cores, in which case they only care about the quantity of occupancy, namely, (ReleaseAtCycles - AcquireAtCycles). In other words, writing
AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, Cycles]
for out-of-order cores has the same effect as writing
ReleaseAtCycles = [1, !sub(Cycles, 1)]
w.r.t MachineScheduler.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks, I will do that.
Out of order cores don't necessarily need to chain acquire-at and release-at cycles.
| } | ||
|
|
||
| /// Linear LMUL scaling starting from smallest fractional LMUL | ||
| class AscalonGetCyclesLMULFractional<string mx, int base> { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Unused?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Not anymore
Add a number of read advances, VExtV, and VRGatherVV cost function.
Co-authored-by: Craig Topper <[email protected]>
|
@mshockwave @topperc PTAL Model should be functionally complete, though there are some intentional inaccuracies, which I hope can be reworked at some point. For example, I haven't tried to get very precise with cycles breakdown for instructions using multiple functional units, mostly because micro op breakdown isn't accurate yet. Vector tests have been "borrowed" from another in-tree arch 😄 |
|
gentle ping |
yeah speaking of this, we really need a more elegant way to clean up -- potentially centralize -- all scheduling model tests. Otherwise it's really easy to miss test coverage. |
| def : WriteRes<WriteRdVLENB, [AscalonFXA]>; | ||
|
|
||
| // Configuration-Setting Instructions | ||
| let Latency = 1 in { |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
nit: default Latency is already 1
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Addressed
| defm "" : LMULWriteResMX<"WriteVLDE", [AscalonLS], mx, IsWorstCase>; | ||
| defm "" : LMULWriteResMX<"WriteVLDFF", [AscalonLS], mx, IsWorstCase>; | ||
| } | ||
| let Latency = 1 in |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ditto remove Latency = 1
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Addressed
| @@ -0,0 +1,1009 @@ | |||
| # NOTE: Assertions have been autogenerated by utils/update_mca_test_checks.py | |||
| # RUN: llvm-mca -mtriple=riscv64 -mcpu=tt-ascalon-d8 -iterations=1 < %s | FileCheck %s | |||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
please use -instruction-tables=full for all new scheduling model tests.
Ditto for all other new test files.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Addressed
| @@ -633,6 +633,13 @@ def TENSTORRENT_ASCALON_D8 : RISCVProcessorModel<"tt-ascalon-d8", | |||
| FeatureUnalignedVectorMem]), | |||
| [TuneNoDefaultUnroll, | |||
| TuneNLogNVRGather, | |||
| TuneOptimizedNF2SegmentLoadStore, | |||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Adding these tunes should be in a separate PR?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can do either way, though I did call them out in the description. On the other hand maybe combine with instruction tables flag to existing tests.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moved out to #168800
Yeah, we do. There was discussion at the summit about potentially using UnifiedDB for this, though its support for vector constraints isn't complete. |
Remove redundant latencies of 1. Use `-instruction-tables=full` in llvm-mca test command lines.
🐧 Linux x64 Test Results
|
|
I've separated the tune flags out to #168800 |
Add the vector scheduling model for tt-ascalon-d8 and corresponding llvm-mca tests.
Drive-by: additional tuning knobs.