-
Notifications
You must be signed in to change notification settings - Fork 15.5k
[RISCV] Update SpacemiT-X60 vector floating-point instructions latencies #150618
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 7 commits
17cd357
a9a540b
603a72c
0e41d47
976ba6a
b206ab3
b15d15e
07131e0
b86c593
8ddd296
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -104,6 +104,11 @@ class Get461018Latency<string mx> { | |
| int c = GetLMULValue<[/*MF8=*/4, /*MF4=*/4, /*MF2=*/4, /*M1=*/4, /*M2=*/6, /*M4=*/10, /*M8=*/18], mx>.c; | ||
| } | ||
|
|
||
| // Used for: FP FMA operations, complex FP ops | ||
| class Get6678Latency<string mx> { | ||
| int c = GetLMULValue<[/*MF8=*/6, /*MF4=*/6, /*MF2=*/6, /*M1=*/6, /*M2=*/6, /*M4=*/7, /*M8=*/8], mx>.c; | ||
| } | ||
|
|
||
| //===----------------------------------------------------------------------===// | ||
|
|
||
| class SMX60IsWorstCaseMX<string mx, list<string> MxList> { | ||
|
|
@@ -541,83 +546,169 @@ foreach mx = SchedMxListF in { | |
| foreach sew = SchedSEWSet<mx, isF=1>.val in { | ||
| defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c; | ||
|
|
||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| } | ||
| } | ||
| defvar VFALULat = Get4458Latency<mx>.c; | ||
| defvar VFALUOcc = ConstOneUntilM1ThenDouble<mx>.c; | ||
| let Latency = VFALULat, ReleaseAtCycles = [VFALUOcc] in { | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFALUV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFALUF", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| } | ||
|
|
||
| foreach mx = SchedMxListF in { | ||
| foreach sew = SchedSEWSet<mx, isF=1>.val in { | ||
| defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, isF=1>.c; | ||
| // Slightly increased latency for sew == 64 | ||
| defvar VFMulVLatAndOcc = !if(!eq(sew, 64), ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c, | ||
| Get4458Latency<mx>.c); | ||
| let Latency = VFMulVLatAndOcc, ReleaseAtCycles = [VFMulVLatAndOcc] in { | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFMulV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| } | ||
| // VFMulF has the same latency as VFMulV, but slightly lower ReleaseAtCycles | ||
| let Latency = VFMulVLatAndOcc, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in { | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFMulF", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| } | ||
|
|
||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFMinMaxF", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defvar VFSgnjLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; | ||
| defvar VFSgnjOcc = ConstOneUntilMF2ThenDouble<mx>.c; | ||
| let Latency = VFSgnjLat, ReleaseAtCycles = [VFSgnjOcc] in { | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFRecpV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFSgnjF", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| } | ||
|
|
||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| // The following covers vfmacc, vfmsac, and their vfn* variants in the same group, but the | ||
| // ReleaseAtCycles takes one extra cycle for the vfn* variants. | ||
| // TODO: Should we split them? | ||
| // TODO: for some reason, the following condition is not working, and it always uses ConstValueUntilLMULThenDoubleBase<"M4", 5, 8, mx>.c | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is this TODO still valid?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Unfortunately, yes. You can see it in rvv-fp.s: for sew == 64, it should be picking 6678, but it stays at 5558 like the other condition. I tried to debug this some months ago, but got nowhere. The diff is small, though. |
||
| defvar VFMulAddLatency = !if(!eq(sew, 64), | ||
| Get6678Latency<mx>.c, | ||
| ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c | ||
| ); | ||
| let Latency = VFMulAddLatency, ReleaseAtCycles = [ConstOneUntilM1ThenDouble<mx>.c] in { | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| foreach mx = SchedMxList in { | ||
| defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxList>.c; | ||
|
|
||
| defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>; | ||
| defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>; | ||
| defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>; | ||
| defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>; | ||
| defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>; | ||
| // Slightly increased ReleaseAtCycles for M8: 18 | ||
| defvar VFCmpOcc = !if(!eq(mx, "M8"), | ||
| !add(ConstOneUntilMF2ThenDouble<mx>.c, 2), | ||
| ConstOneUntilMF2ThenDouble<mx>.c | ||
| ); | ||
| let Latency = Get461018Latency<mx>.c, ReleaseAtCycles = [VFCmpOcc] in { | ||
| defm "" : LMULWriteResMX<"WriteVFCmpV", [SMX60_VFP], mx, IsWorstCase>; | ||
| defm "" : LMULWriteResMX<"WriteVFCmpF", [SMX60_VFP], mx, IsWorstCase>; | ||
| } | ||
|
|
||
| defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; | ||
| defvar VFClassLat = ConstValueUntilLMULThenDouble<"M2", 4, mx>.c; | ||
| defvar VFClassOcc = ConstOneUntilMF2ThenDouble<mx>.c; | ||
| let Latency = VFClassLat, ReleaseAtCycles = [VFClassOcc] in { | ||
| defm "" : LMULWriteResMX<"WriteVFClassV", [SMX60_VFP], mx, IsWorstCase>; | ||
| defm "" : LMULWriteResMX<"WriteVFMergeV", [SMX60_VFP], mx, IsWorstCase>; | ||
| defm "" : LMULWriteResMX<"WriteVFMovV", [SMX60_VFP], mx, IsWorstCase>; | ||
| defm "" : LMULWriteResMX<"WriteVFCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; | ||
| } | ||
| } | ||
|
|
||
| // Widening | ||
| foreach mx = SchedMxListW in { | ||
| foreach sew = SchedSEWSet<mx, isF=0, isWidening=1>.val in { | ||
| defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListW>.c; | ||
|
|
||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defvar VFWCvtILat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c; | ||
| defvar VFWCvtIOcc = ConstOneUntilMF4ThenDouble<mx>.c; | ||
| let Latency = VFWCvtILat, ReleaseAtCycles = [VFWCvtIOcc] in { | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| foreach mx = SchedMxListFW in { | ||
| defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListFW>.c; | ||
|
|
||
| defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; | ||
| defvar VFWCvtFToIVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c; | ||
| defvar VFWCvtFToIVOcc = ConstOneUntilMF4ThenDouble<mx>.c; | ||
| let Latency = VFWCvtFToIVLat, ReleaseAtCycles = [VFWCvtFToIVOcc] in { | ||
| defm "" : LMULWriteResMX<"WriteVFWCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; | ||
| } | ||
| } | ||
|
|
||
| foreach mx = SchedMxListFW in { | ||
| foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { | ||
| defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; | ||
|
|
||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defvar VFWCvtFToFVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c; | ||
| defvar VFWCvtFToFVOcc = ConstOneUntilMF4ThenDouble<mx>.c; | ||
| let Latency = VFWCvtFToFVLat, ReleaseAtCycles = [VFWCvtFToFVOcc] in { | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFWCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| } | ||
|
|
||
| // Latency for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 4/4/4/5/8 | ||
| // ReleaseAtCycles for vfwsub/vfwadd.vv, vfwsub/vfwadd.vf: 1/1/2/4/8 | ||
| // Latency for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 5/5/5/9/17 | ||
| // ReleaseAtCycles for vfwsub/vfwadd.wv, vfwsub/vfwadd.wf: 1/2/4/8/17 | ||
| // We use the worst-case | ||
| defvar VFWALULat = !add(ConstValueUntilLMULThenDouble<"M1", 4, mx>.c, 1); // 5/5/9/17 | ||
| defvar VFWALUOcc = !if(!eq(mx, "M4"), | ||
| !add(ConstOneUntilMF4ThenDouble<mx>.c, 1), // 2/4/8/17 | ||
| ConstOneUntilMF4ThenDouble<mx>.c | ||
| ); | ||
| // TODO: Split .wf/.wv variants into separate scheduling classes | ||
| let Latency = VFWALULat, ReleaseAtCycles = [VFWALUOcc] in { | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFWALUF", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| } | ||
|
|
||
| let Latency = Get4588Latency<mx>.c, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in { | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulF", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| } | ||
|
|
||
| // Slightly increased ReleaseAtCycles for SEW == 32 | ||
| defvar VFWMullOcc = !if(!eq(sew, 32), | ||
| GetLMULValue<[1, 1, 1, 3, 5, 9, 18], mx>.c, | ||
| ConstOneUntilMF2ThenDouble<mx>.c | ||
| ); | ||
| defvar VFWMulVLat = ConstValueUntilLMULThenDoubleBase<"M8", 5, 8, mx>.c; | ||
| let Latency = VFWMulVLat, ReleaseAtCycles = [VFWMullOcc] in { | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| } | ||
|
|
||
| // Latency for vfwmacc, vfwnmacc, etc: e16 = 5/5/5/8; e32 = 6/6/7/8 | ||
| defvar VFWMulAddVLat = !if(!eq(sew, 16), | ||
| ConstValueUntilLMULThenDoubleBase<"M4", 5, 8, mx>.c, | ||
| Get6678Latency<mx>.c | ||
| ); | ||
| let Latency = VFWMulAddVLat, ReleaseAtCycles = [ConstOneUntilMF2ThenDouble<mx>.c] in { | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFWMulAddF", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Narrowing | ||
| foreach mx = SchedMxListW in { | ||
| defvar IsWorstCase = SMX60IsWorstCaseMX<mx, SchedMxListW>.c; | ||
|
|
||
| defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; | ||
| defvar VFNCvtFToIVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c; | ||
| defvar VFNCvtFToIVOcc = ConstOneUntilMF4ThenDouble<mx>.c; | ||
| let Latency = VFNCvtFToIVLat, ReleaseAtCycles = [VFNCvtFToIVOcc] in { | ||
| defm "" : LMULWriteResMX<"WriteVFNCvtFToIV", [SMX60_VFP], mx, IsWorstCase>; | ||
| } | ||
| } | ||
|
|
||
| foreach mx = SchedMxListFW in { | ||
| foreach sew = SchedSEWSet<mx, isF=1, isWidening=1>.val in { | ||
|
|
||
| defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListFW, isF=1>.c; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
|
|
||
| defvar VFNCvtToFVLat = ConstValueUntilLMULThenDouble<"M1", 4, mx>.c; | ||
| defvar VFNCvtToFVOcc = ConstOneUntilMF4ThenDouble<mx>.c; | ||
| let Latency = VFNCvtToFVLat, ReleaseAtCycles = [VFNCvtToFVOcc] in { | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtIToFV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFNCvtFToFV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -626,9 +717,35 @@ foreach mx = SchedMxListF in { | |
| foreach sew = SchedSEWSet<mx, 1>.val in { | ||
| defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c; | ||
|
|
||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| // Compute Latency and ReleaseAtCycles based on SEW | ||
| // Latency for vfdiv.vf: e16/e32 = 12/24/48/96; e64 = 18/36/72/144 | ||
| // Latency for vfrdiv.vf: e16/e32 = 12/24/48/96; e64 = 40/80/160/320 | ||
| // We use the worst-case, vfdiv.vf is penalized in e64 | ||
| // TODO: split vfdiv.vf and vfrdiv.vf into separate scheduling classes | ||
| defvar VFDivFFactor = !if(!eq(sew, 64), 40, 12); | ||
| defvar VFDivFLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFDivFFactor); | ||
| let Latency = VFDivFLatAndOcc, ReleaseAtCycles = [VFDivFLatAndOcc] in { | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFDivF", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| } | ||
|
|
||
| defvar VFDivVFactor = !if(!eq(sew, 16), 12, 40); | ||
| defvar VFDivVLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFDivVFactor); | ||
| let Latency = VFDivVLatAndOcc, ReleaseAtCycles = [VFDivVLatAndOcc] in { | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFDivV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Pattern for vfsqrt.v: e16 = 18/36/72/144; e32 = 38/76/152/304; e64 = 40/80/160/320 | ||
| foreach mx = SchedMxListF in { | ||
| foreach sew = SchedSEWSet<mx, 1>.val in { | ||
| defvar IsWorstCase = SMX60IsWorstCaseMXSEW<mx, sew, SchedMxListF, 1>.c; | ||
|
|
||
| defvar VFSqrtVFactor = !if(!eq(sew, 16), 12, 40); | ||
| defvar VFSqrtVLatAndOcc = !mul(ConstOneUntilM1ThenDouble<mx>.c, VFSqrtVFactor); | ||
| let Latency = VFSqrtVLatAndOcc, ReleaseAtCycles = [VFSqrtVLatAndOcc] in { | ||
| defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SMX60_VFP], mx, sew, IsWorstCase>; | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.