-
Notifications
You must be signed in to change notification settings - Fork 15.2k
[AMDGPU][SIInsertWaitCnts] Gfx12.5 - Refactor xcnt optimization #164357
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 10 commits
4df30af
0ac18c1
d87068d
cca7a3e
44466c2
4770a1e
64fb73e
2683aa7
a1ee5b8
7708bb6
02bfb8c
935fba6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -633,8 +633,11 @@ class WaitcntBrackets { | |
| const MachineOperand &Op) const; | ||
|
|
||
| bool counterOutOfOrder(InstCounterType T) const; | ||
| void simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const; | ||
| void simplifyWaitcnt(AMDGPU::Waitcnt &Wait); | ||
| void simplifyWaitcnt(InstCounterType T, unsigned &Count) const; | ||
| bool hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait); | ||
| bool canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait); | ||
| void simplifyXcnt(AMDGPU::Waitcnt &CheckWait, AMDGPU::Waitcnt &UpdateWait); | ||
|
|
||
| void determineWait(InstCounterType T, RegInterval Interval, | ||
| AMDGPU::Waitcnt &Wait) const; | ||
|
|
@@ -646,7 +649,6 @@ class WaitcntBrackets { | |
|
|
||
| void applyWaitcnt(const AMDGPU::Waitcnt &Wait); | ||
| void applyWaitcnt(InstCounterType T, unsigned Count); | ||
| void applyXcnt(const AMDGPU::Waitcnt &Wait); | ||
| void updateByEvent(WaitEventType E, MachineInstr &MI); | ||
|
|
||
| unsigned hasPendingEvent() const { return PendingEvents; } | ||
|
|
@@ -1192,15 +1194,15 @@ void WaitcntBrackets::print(raw_ostream &OS) const { | |
|
|
||
| /// Simplify the waitcnt in place, in the sense of removing redundant counts; the | ||
| /// caller inspects the updated Wait to decide whether a waitcnt is needed at all. | ||
| void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const { | ||
| void WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) { | ||
| simplifyWaitcnt(LOAD_CNT, Wait.LoadCnt); | ||
| simplifyWaitcnt(EXP_CNT, Wait.ExpCnt); | ||
| simplifyWaitcnt(DS_CNT, Wait.DsCnt); | ||
| simplifyWaitcnt(STORE_CNT, Wait.StoreCnt); | ||
| simplifyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); | ||
| simplifyWaitcnt(BVH_CNT, Wait.BvhCnt); | ||
| simplifyWaitcnt(KM_CNT, Wait.KmCnt); | ||
| simplifyWaitcnt(X_CNT, Wait.XCnt); | ||
| simplifyXcnt(Wait, Wait); | ||
| } | ||
|
|
||
| void WaitcntBrackets::simplifyWaitcnt(InstCounterType T, | ||
|
|
@@ -1270,7 +1272,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) { | |
| applyWaitcnt(SAMPLE_CNT, Wait.SampleCnt); | ||
| applyWaitcnt(BVH_CNT, Wait.BvhCnt); | ||
| applyWaitcnt(KM_CNT, Wait.KmCnt); | ||
| applyXcnt(Wait); | ||
| applyWaitcnt(X_CNT, Wait.XCnt); | ||
| } | ||
|
|
||
| void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { | ||
|
|
@@ -1287,41 +1289,41 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) { | |
| } | ||
| } | ||
|
|
||
| void WaitcntBrackets::applyXcnt(const AMDGPU::Waitcnt &Wait) { | ||
| // On entry to a block with multiple predecessors, there may | ||
| // be pending SMEM and VMEM events active at the same time. | ||
| // In such cases, only clear one active event at a time. | ||
| auto applyPendingXcntGroup = [this](unsigned E) { | ||
| unsigned LowerBound = getScoreLB(X_CNT); | ||
| applyWaitcnt(X_CNT, 0); | ||
| PendingEvents |= (1 << E); | ||
| setScoreLB(X_CNT, LowerBound); | ||
| }; | ||
|
|
||
| bool WaitcntBrackets::hasRedundantXCntWithKmCnt(const AMDGPU::Waitcnt &Wait) { | ||
| // Wait on XCNT is redundant if we are already waiting for a load to complete. | ||
| // SMEM can return out of order, so only omit XCNT wait if we are waiting till | ||
| // zero. | ||
| if (Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP)) { | ||
| if (hasPendingEvent(VMEM_GROUP)) | ||
| applyPendingXcntGroup(VMEM_GROUP); | ||
| else | ||
| applyWaitcnt(X_CNT, 0); | ||
| return; | ||
| } | ||
| return Wait.KmCnt == 0 && hasPendingEvent(SMEM_GROUP); | ||
| } | ||
|
|
||
| bool WaitcntBrackets::canOptimizeXCntWithLoadCnt(const AMDGPU::Waitcnt &Wait) { | ||
| // If we have pending store we cannot optimize XCnt because we do not wait for | ||
| // stores. VMEM loads return in order, so if we only have loads XCnt is | ||
| // decremented to the same number as LOADCnt. | ||
| if (Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && | ||
| !hasPendingEvent(STORE_CNT)) { | ||
| if (hasPendingEvent(SMEM_GROUP)) | ||
| applyPendingXcntGroup(SMEM_GROUP); | ||
| else | ||
| applyWaitcnt(X_CNT, std::min(Wait.XCnt, Wait.LoadCnt)); | ||
| return; | ||
| } | ||
| return Wait.LoadCnt != ~0u && hasPendingEvent(VMEM_GROUP) && | ||
| !hasPendingEvent(STORE_CNT); | ||
| } | ||
|
|
||
| applyWaitcnt(X_CNT, Wait.XCnt); | ||
| void WaitcntBrackets::simplifyXcnt(AMDGPU::Waitcnt &CheckWait, | ||
| AMDGPU::Waitcnt &UpdateWait) { | ||
| // Try to simplify xcnt further by checking for joint kmcnt and loadcnt | ||
| // optimizations. On entry to a block with multiple predecessors, there may | ||
| // be pending SMEM and VMEM events active at the same time. | ||
| // In such cases, only clear one active event at a time. | ||
| if (hasRedundantXCntWithKmCnt(CheckWait)) { | ||
| if (!hasMixedPendingEvents(X_CNT)) { | ||
| applyWaitcnt(X_CNT, 0); | ||
| } else { | ||
| PendingEvents &= ~(1 << SMEM_GROUP); | ||
| } | ||
| } else if (canOptimizeXCntWithLoadCnt(CheckWait)) { | ||
| if (!hasMixedPendingEvents(X_CNT)) { | ||
| applyWaitcnt(X_CNT, std::min(CheckWait.XCnt, CheckWait.LoadCnt)); | ||
| } else if (CheckWait.LoadCnt == 0) { | ||
| PendingEvents &= ~(1 << VMEM_GROUP); | ||
| } | ||
| } | ||
| simplifyWaitcnt(X_CNT, UpdateWait.XCnt); | ||
| } | ||
|
|
||
| // Where there are multiple types of event in the bracket of a counter, | ||
|
|
@@ -1656,6 +1658,8 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( | |
| } | ||
| } | ||
|
|
||
| // Save the pre combine waitcnt in order to make xcnt checks. | ||
| AMDGPU::Waitcnt PreCombine = Wait; | ||
| if (CombinedLoadDsCntInstr) { | ||
| // Only keep an S_WAIT_LOADCNT_DSCNT if both counters actually need | ||
| // to be waited for. Otherwise, let the instruction be deleted so | ||
|
|
@@ -1746,6 +1750,13 @@ bool WaitcntGeneratorGFX12Plus::applyPreexistingWaitcnt( | |
| } | ||
|
|
||
| for (auto CT : inst_counter_types(NUM_EXTENDED_INST_CNTS)) { | ||
| if ((CT == KM_CNT && ScoreBrackets.hasRedundantXCntWithKmCnt(PreCombine)) || | ||
| (CT == LOAD_CNT && | ||
| ScoreBrackets.canOptimizeXCntWithLoadCnt(PreCombine))) { | ||
| // Xcnt may need to be updated depending on a pre-existing KM/LOAD_CNT | ||
| // due to taking the backedge of a block. | ||
| ScoreBrackets.simplifyXcnt(PreCombine, Wait); | ||
| } | ||
| if (!WaitInstrs[CT]) | ||
| continue; | ||
|
|
||
|
|
@@ -2092,6 +2103,14 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(MachineInstr &MI, | |
| // Verify that the wait is actually needed. | ||
| ScoreBrackets.simplifyWaitcnt(Wait); | ||
|
|
||
| // Since the translation for VMEM addresses occurs in-order, we can apply the | ||
| // XCnt if the current instruction is of VMEM type and has a memory | ||
| // dependency with another VMEM instruction in flight. | ||
| if (Wait.XCnt != ~0u && isVmemAccess(MI)) { | ||
| ScoreBrackets.applyWaitcnt(X_CNT, Wait.XCnt); | ||
| Wait.XCnt = ~0u; | ||
| } | ||
|
|
||
| // When forcing emit, we need to skip terminators because that would break the | ||
| // terminators of the MBB if we emit a waitcnt between terminators. | ||
| if (ForceEmitZeroFlag && !MI.isTerminator()) | ||
|
|
@@ -2160,21 +2179,6 @@ bool SIInsertWaitcnts::generateWaitcnt(AMDGPU::Waitcnt Wait, | |
| << "Update Instr: " << *It); | ||
| } | ||
|
|
||
| // XCnt may be already consumed by a load wait. | ||
| if (Wait.XCnt != ~0u) { | ||
| if (Wait.KmCnt == 0 && !ScoreBrackets.hasPendingEvent(SMEM_GROUP)) | ||
| Wait.XCnt = ~0u; | ||
|
|
||
| if (Wait.LoadCnt == 0 && !ScoreBrackets.hasPendingEvent(VMEM_GROUP)) | ||
| Wait.XCnt = ~0u; | ||
|
|
||
| // Since the translation for VMEM addresses occurs in-order, we can skip the | ||
| // XCnt if the current instruction is of VMEM type and has a memory | ||
| // dependency with another VMEM instruction in flight. | ||
| if (isVmemAccess(*It)) | ||
| Wait.XCnt = ~0u; | ||
| } | ||
|
|
||
|
Comment on lines
-2157
to
-2171
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @nhaehnle @jayfoad Thoughts on this case (see changes in bf16.ll)? - Previously the xcnt was not waited on because of the check on L2168, but now it also requires
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is waiting for #166779 though (currently just includes Jay's fix in this PR.)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We could also just have this one supersede it, up to you Jay.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Apparently there was a late change to gfx1250 that made it so every s_wait_loadcnt (and storecnt) also waits for the equivalent xcnt value. We need to get some clarity on that internally.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Interesting. I also see that hardware waits for xcnt==0 at every branch/call, which probably means that SIInsertWaitcnts does not after all have to handle a mixture of pending SMEM Xcnts and VMEM Xcnts.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Added a todo to revisit. |
||
| if (WCG->createNewWaitcnt(Block, It, Wait)) | ||
| Modified = true; | ||
|
|
||
|
|
||
Uh oh!
There was an error while loading. Please reload this page.