-
Notifications
You must be signed in to change notification settings - Fork 15.3k
MTM: improve operand latency when missing sched info #101389
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Closed
Closed
Changes from all commits
Commits
Show all changes
2 commits
Select commit
Hold shift + click to select a range
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,6 +20,7 @@ | |
| #include "llvm/CodeGen/MachineLoopInfo.h" | ||
| #include "llvm/CodeGen/MachineOperand.h" | ||
| #include "llvm/CodeGen/MachineRegisterInfo.h" | ||
| #include "llvm/CodeGen/TargetInstrInfo.h" | ||
| #include "llvm/CodeGen/TargetRegisterInfo.h" | ||
| #include "llvm/CodeGen/TargetSchedule.h" | ||
| #include "llvm/CodeGen/TargetSubtargetInfo.h" | ||
|
|
@@ -761,6 +762,59 @@ static void updatePhysDepsDownwards(const MachineInstr *UseMI, | |
| } | ||
| } | ||
|
|
||
| /// Estimates the number of cycles elapsed between DefMI and UseMI, DefMI | ||
| /// inclusive and UseMI exclusive, if they're in the same MBB. Returns | ||
| /// std::nullopt if they're in different MBBs, and 0 if UseMI is null. | ||
| static std::optional<unsigned> | ||
| estimateDefUseCycles(const TargetSchedModel &Sched, const MachineInstr *DefMI, | ||
| const MachineInstr *UseMI) { | ||
| if (!UseMI) | ||
| return 0; | ||
| if (DefMI->getParent() != UseMI->getParent()) | ||
| return std::nullopt; | ||
|
|
||
| const auto DefIt = DefMI->getIterator(); | ||
| const auto UseIt = UseMI->getIterator(); | ||
|
|
||
| unsigned NumMicroOps = 0; | ||
| for (auto It = DefIt; It != UseIt; ++It) { | ||
| // In cases where the UseMI is a PHI at the beginning of the MBB, compute | ||
| // MicroOps until the end of the MBB. | ||
| if (It.isEnd()) | ||
| break; | ||
|
|
||
| NumMicroOps += Sched.getNumMicroOps(&*It); | ||
| } | ||
| return NumMicroOps / Sched.getIssueWidth(); | ||
| } | ||
|
|
||
| /// Wraps Sched.computeOperandLatency, accounting for the case when | ||
| /// InstrSchedModel and InstrItineraries are not available: in this case, | ||
| /// Sched.computeOperandLatency returns DefaultDefLatency, which is a very rough | ||
| /// approximation; to improve this approximation, offset it by the approximate | ||
| /// cycles elapsed from DefMI to UseMI (since the MIs could be re-ordered by the | ||
| /// scheduler, and we don't have this information, this cannot be known | ||
| /// exactly). When scheduling information is available, | ||
artagnon marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| /// Sched.computeOperandLatency returns a much better estimate (especially if | ||
| /// UseMI is non-null), so we just return that. | ||
| static unsigned computeOperandLatency(const TargetSchedModel &Sched, | ||
artagnon marked this conversation as resolved.
Show resolved
Hide resolved
artagnon marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| const MachineInstr *DefMI, | ||
| unsigned DefOperIdx, | ||
| const MachineInstr *UseMI, | ||
| unsigned UseOperIdx) { | ||
| assert(DefMI && "Non-null DefMI expected"); | ||
| if (!Sched.hasInstrSchedModel() && !Sched.hasInstrItineraries()) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Instead of placing this in this wrapper, could the default implementation of computeOperandLatency handle this?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
| unsigned DefaultDefLatency = Sched.getInstrInfo()->defaultDefLatency( | ||
michaelmaitland marked this conversation as resolved.
Show resolved
Hide resolved
michaelmaitland marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| *Sched.getMCSchedModel(), *DefMI); | ||
| std::optional<unsigned> DefUseCycles = | ||
| estimateDefUseCycles(Sched, DefMI, UseMI); | ||
| if (!DefUseCycles || DefaultDefLatency <= DefUseCycles) | ||
| return 0; | ||
| return DefaultDefLatency - *DefUseCycles; | ||
| } | ||
| return Sched.computeOperandLatency(DefMI, DefOperIdx, UseMI, UseOperIdx); | ||
| } | ||
|
|
||
| /// The length of the critical path through a trace is the maximum of two path | ||
| /// lengths: | ||
| /// | ||
|
|
@@ -813,8 +867,8 @@ updateDepth(MachineTraceMetrics::TraceBlockInfo &TBI, const MachineInstr &UseMI, | |
| unsigned DepCycle = Cycles.lookup(Dep.DefMI).Depth; | ||
| // Add latency if DefMI is a real instruction. Transients get latency 0. | ||
| if (!Dep.DefMI->isTransient()) | ||
| DepCycle += MTM.SchedModel | ||
| .computeOperandLatency(Dep.DefMI, Dep.DefOp, &UseMI, Dep.UseOp); | ||
| DepCycle += computeOperandLatency(MTM.SchedModel, Dep.DefMI, Dep.DefOp, | ||
| &UseMI, Dep.UseOp); | ||
| Cycle = std::max(Cycle, DepCycle); | ||
| } | ||
| // Remember the instruction depth. | ||
|
|
@@ -929,8 +983,8 @@ static unsigned updatePhysDepsUpwards(const MachineInstr &MI, unsigned Height, | |
| if (!MI.isTransient()) { | ||
| // We may not know the UseMI of this dependency, if it came from the | ||
| // live-in list. SchedModel can handle a NULL UseMI. | ||
| DepHeight += SchedModel.computeOperandLatency(&MI, MO.getOperandNo(), | ||
| I->MI, I->Op); | ||
| DepHeight += computeOperandLatency(SchedModel, &MI, MO.getOperandNo(), | ||
| I->MI, I->Op); | ||
| } | ||
| Height = std::max(Height, DepHeight); | ||
| // This regunit is dead above MI. | ||
|
|
@@ -963,10 +1017,9 @@ static bool pushDepHeight(const DataDep &Dep, const MachineInstr &UseMI, | |
| unsigned UseHeight, MIHeightMap &Heights, | ||
| const TargetSchedModel &SchedModel, | ||
| const TargetInstrInfo *TII) { | ||
| // Adjust height by Dep.DefMI latency. | ||
| if (!Dep.DefMI->isTransient()) | ||
| UseHeight += SchedModel.computeOperandLatency(Dep.DefMI, Dep.DefOp, &UseMI, | ||
| Dep.UseOp); | ||
| UseHeight += computeOperandLatency(SchedModel, Dep.DefMI, Dep.DefOp, &UseMI, | ||
| Dep.UseOp); | ||
|
|
||
| // Update Heights[DefMI] to be the maximum height seen. | ||
| MIHeightMap::iterator I; | ||
|
|
@@ -1192,8 +1245,8 @@ MachineTraceMetrics::Trace::getPHIDepth(const MachineInstr &PHI) const { | |
| unsigned DepCycle = getInstrCycles(*Dep.DefMI).Depth; | ||
| // Add latency if DefMI is a real instruction. Transients get latency 0. | ||
| if (!Dep.DefMI->isTransient()) | ||
| DepCycle += TE.MTM.SchedModel.computeOperandLatency(Dep.DefMI, Dep.DefOp, | ||
| &PHI, Dep.UseOp); | ||
| DepCycle += computeOperandLatency(TE.MTM.SchedModel, Dep.DefMI, Dep.DefOp, | ||
| &PHI, Dep.UseOp); | ||
| return DepCycle; | ||
| } | ||
|
|
||
|
|
||
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I wonder if it would be useful to use
defaultDefLatency for the instructions we are iterating over here in the case that some of the instructions we are iterating over depend on each other. Imagine a scenario: In this case, what if we checked defaultDefLatency of a and used it to understand the number of cycles elapsed between a and b? For example, if
a has a default latency of 10, then b can't really start in the next NumMicroOps / IssueWidth cycles, since it has to wait 10 additional cycles. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Sorry, I'm having trouble understanding your example. Here's what I think I understand:
Here, if c has a default def latency of N cycles, a has that of M cycles, b can start after M - 1 cycles, and d can start after N - 3 cycles, assuming issue-width = num-micro-ops = 1. What do you mean by "instructions that depend on each other"? How can one instruction depend on another, which in turn depends on the first? Wouldn't this break basic dominance criteria? Also, if I understand correctly, I think we're in SSA form at this point, so we don't have to worry about re-definitions.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We have a loop that iterates over instructions between
DefMI and UseMI: I suggest the following scenario: In this case, we will be looping over instructions
a and b and adding their number of micro ops to calculate the number of cycles elapsed between defmi and usemi. Let's take the assumption that
a has default def latency of N cycles and b has default latency of M cycles. In my scenario,
b uses the result of a. It cannot start until that result is ready (an extra N cycles). If we want to make it concrete, we could imagine that it looks like this: We cannot start the subtraction until
a is finished calculating. This is what I am calling a dependency. We can assume for sake of simplicity here that a and b are independent from defmi and usemi.
In this scenario, I am suggesting that if the default latency of a is larger than the number of micro-ops, then we must wait at least the default latency of
a before starting b. I suggest that we can incorporate this into the estimation. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah, thanks for the explanation! However, I think we're missing the fact that CPUs are usually pipelined, and the defmi-usemi dependency will be in one pipeline, while the a-b dependency will be in another pipeline. Hence, I think the defmi-usemi dependency should be independent of the a-b dependency. When the code is called with DefMI = a, and UseMI = b, it will return the correct answer for that dependency.
Now, we haven't actually modeled any pipelines, but do you think this is feasible?
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is a good point that if there are multiple pipelines and the defmi-usemi instructions goes down one and the a-b goes down another, then what you have is correct. There definitely is also a scenario where these dependency pairs go down the same pipeline and in that case what I am suggesting is probably the better model.
Unfortunately we don't have any pipeline information because we don't have the scheduler model, so we don't actually know what we should do.
One argument is to keep what we have now because it's simple and less expensive to compute.
Another argument is to pick the "more common" approach. I'd prefer not to make a blanket statement and say that "it is more likely for independent instructions to go down different pipelines", although I wouldn't be surprised if this was the case.
For these reasons, I am content with the approach you are proposing. Happy to see if anyone else has thoughts on this.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I may have missed something but the use of micro-ops is worrying me - many architectures don't guarantee that uop count and latency are a close match (e.g. Alder Lake divpd ymm uops=1 latency=15). And then dividing by issue width makes it feel more like a throughput estimate than latency?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see. Will Alder Lake divpd not dispatch in one cycle though? We're offsetting DefaultDefLatency by the number of cycles elapsed between the DefMI and UseMI. If there's an Alder Lake divpd between the DefMI and UseMI, should we be subtracting 15 cycles for it?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@RKSimon do you still have concerns about use of micro-ops and issue width here?
I think it makes sense to use issue width in determining "the number of cycles between two instructions" in the calculation here. For example:
If the issue width is 1, then
a is issued in one cycle, and b the next. But if the issue width is 2 then a and b are issued in the same cycle. In the former case we should estimate 2 cycles between [a, c) and in the latter estimate 1 cycle. There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Doesn't this make assumptions that the ops can be issued on any pipe? I'm still not convinced your approach makes sense tbh.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since we don't have the scheduling model, we could either assume that all ops are issued on the same pipeline, or that independent ops are always issued on different pipelines. I think the latter case is more common. The patch is a first-order improvement over DefaultDefLatency: it is by no means an accurate representation of how the machine functions, but wouldn't you agree that the patch is an improvement over DefaultDefLatency?
Of course, if you have a better idea concerning how to improve to latency computation in the fallback case, please do suggest.