[AMDGPU] Generate waterfall for calls with SGPR(inreg) argument #146997
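From the diff below, the patch appears to do three related things: it threads an optional list of physical SGPRs (PhySGPRs) through emitLoadScalarOpsFromVGPRLoop and loadMBUFScalarOperandsFromVGPR so a waterfall loop can copy each uniformed value into a fixed ABI register, it moves the SI_CALL_ISEL call-sequence handling out of legalizeOperands into a new SIInstrInfo::createWaterFall helper, and it adjusts SIFixSGPRCopies so copies from a virtual source into a physical SGPR fall through to waterfall legalization instead of being accepted as already legal.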
llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
```diff
@@ -910,12 +910,18 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
               TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
           .add(MI.getOperand(1));
       MI.getOperand(1).setReg(TmpReg);
-    } else if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(),
-                                      MI, MI.getDebugLoc())) {
+      return true;
+    }
+
+    if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(), MI,
+                               MI.getDebugLoc())) {
       I = std::next(I);
       MI.eraseFromParent();
+      return true;
     }
-    return true;
+
+    if (!SrcReg.isVirtual())
+      return true;
```
Comment on lines +923 to +924

arsenm: I don't understand this early exit, the !SrcReg.isVirtual() has existing explicit handling just below here.

Reply: Hi @arsenm, the upcoming …
```diff
   }
   if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
     SIInstrWorklist worklist;
@@ -941,7 +947,7 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
   if (PHISources.contains(MI))
     return;
   Register DstReg = MI->getOperand(0).getReg();
-  const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
+  const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, DstReg);
 
   V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI,
                    TRI->getRegSizeInBits(*DstRC));
```
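A plausible motivation for the getRegClassForReg switch (my inference; the thread does not state it): MRI->getRegClass is only meaningful for virtual registers, whereas TRI->getRegClassForReg(*MRI, DstReg) also resolves physical registers, and with this patch analyzeVGPRToSGPRCopy can now encounter copies whose destination is a physical SGPR, such as an inreg argument register.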
llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
```diff
@@ -6861,13 +6861,10 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
 // Emit the actual waterfall loop, executing the wrapped instruction for each
 // unique value of \p ScalarOps across all lanes. In the best case we execute 1
 // iteration, in the worst case we execute 64 (once per lane).
-static void
-emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
-                              MachineRegisterInfo &MRI,
-                              MachineBasicBlock &LoopBB,
-                              MachineBasicBlock &BodyBB,
-                              const DebugLoc &DL,
-                              ArrayRef<MachineOperand *> ScalarOps) {
+static void emitLoadScalarOpsFromVGPRLoop(
+    const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB,
+    MachineBasicBlock &BodyBB, const DebugLoc &DL,
+    ArrayRef<MachineOperand *> ScalarOps, ArrayRef<Register> PhySGPRs = {}) {
   MachineFunction &MF = *LoopBB.getParent();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
```
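The doc comment above summarizes the waterfall pattern; a minimal sketch of what the emitted loop body does for one 32-bit scalar operand may make it concrete. This assumes the function's own locals (MRI, LoopBB, I, DL, TII, TRI, VScalarOp) and omits the EXEC-masking terminators; it is an illustration, not the file's literal code:

```cpp
// Per-iteration body for a single 32-bit divergent operand (sketch only).
Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
Register NewCondReg = MRI.createVirtualRegister(TRI->getBoolRC());

// Scalarize: read the operand's value from the first active lane.
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
    .addReg(VScalarOp);
// Set a condition bit for every lane whose VGPR value equals that scalar.
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), NewCondReg)
    .addReg(CurReg)
    .addReg(VScalarOp);
// The loop terminators AND this mask into EXEC, execute the wrapped
// instruction for the matching lanes, then branch back until no lanes
// remain -- one iteration per unique value, worst case one per lane.
```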
```diff
@@ -6876,8 +6873,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
 
   MachineBasicBlock::iterator I = LoopBB.begin();
   Register CondReg;
-
-  for (MachineOperand *ScalarOp : ScalarOps) {
+  for (auto [Idx, ScalarOp] : enumerate(ScalarOps)) {
     unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
     unsigned NumSubRegs = RegSize / 32;
     Register VScalarOp = ScalarOp->getReg();
```
```diff
@@ -6906,7 +6902,16 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
       }
 
       // Update ScalarOp operand to use the SGPR ScalarOp.
-      ScalarOp->setReg(CurReg);
+      if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
+        ScalarOp->setReg(CurReg);
+      else {
+        // Insert into the same block of use
+        BuildMI(*ScalarOp->getParent()->getParent(),
+                ScalarOp->getParent()->getIterator(), DL, TII.get(AMDGPU::COPY),
+                PhySGPRs[Idx])
+            .addReg(CurReg);
+        ScalarOp->setReg(PhySGPRs[Idx]);
+      }
       ScalarOp->setIsKill();
     } else {
       SmallVector<Register, 8> ReadlanePieces;
```
```diff
@@ -6975,7 +6980,15 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
       }
 
       // Update ScalarOp operand to use the SGPR ScalarOp.
-      ScalarOp->setReg(SScalarOp);
+      if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
+        ScalarOp->setReg(SScalarOp);
+      else {
+        BuildMI(*ScalarOp->getParent()->getParent(),
+                ScalarOp->getParent()->getIterator(), DL, TII.get(AMDGPU::COPY),
+                PhySGPRs[Idx])
+            .addReg(SScalarOp);
+        ScalarOp->setReg(PhySGPRs[Idx]);
+      }
       ScalarOp->setIsKill();
     }
   }
```
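Why the COPY is emitted at the use rather than next to the readfirstlane sequence (my reading of the "Insert into the same block of use" comment above): the destination is a fixed physical SGPR, so defining it immediately before its user keeps the physical register's live range confined to the waterfall body, where it is redefined on every iteration.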
```diff
@@ -7006,7 +7019,10 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
                                ArrayRef<MachineOperand *> ScalarOps,
                                MachineDominatorTree *MDT,
                                MachineBasicBlock::iterator Begin = nullptr,
-                               MachineBasicBlock::iterator End = nullptr) {
+                               MachineBasicBlock::iterator End = nullptr,
+                               ArrayRef<Register> PhySGPRs = {}) {
+  assert((PhySGPRs.empty() || PhySGPRs.size() == ScalarOps.size()) &&
+         "Physical SGPRs must be empty or match the number of scalar operands");
   MachineBasicBlock &MBB = *MI.getParent();
   MachineFunction &MF = *MBB.getParent();
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
```
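To make the new contract concrete, a hypothetical call site (CallMI, ArgIdx, Begin and End are illustrative names, not identifiers from the patch): one entry in PhySGPRs per entry in ScalarOps, with an invalid Register meaning "no pinning" for that operand.

```cpp
// Hypothetical usage sketch: waterfall a call whose inreg argument is only
// available in a VGPR, pinning the uniformed value to $sgpr4 in the loop.
MachineOperand *ArgOp = &CallMI.getOperand(ArgIdx);
loadMBUFScalarOperandsFromVGPR(*this, CallMI, {ArgOp}, MDT, Begin, End,
                               {AMDGPU::SGPR4});
```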
```diff
@@ -7091,7 +7107,8 @@ loadMBUFScalarOperandsFromVGPR(const SIInstrInfo &TII, MachineInstr &MI,
     }
   }
 
-  emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
+  emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps,
+                                PhySGPRs);
```
Copilot: Inserting a waterfall loop alters control flow and can affect debug info and profiling data. Ensure debug locations and profiling intrinsics are preserved or updated for accurate performance analysis. (Copilot generated this review using guidance from repository custom instructions.)
```diff
 
   MachineBasicBlock::iterator First = RemainderBB->begin();
   // Restore SCC
```
```diff
@@ -7328,27 +7345,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
   if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
     MachineOperand *Dest = &MI.getOperand(0);
     if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
-      // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
-      // following copies, we also need to move copies from and to physical
-      // registers into the loop block.
-      unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
-      unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();
-
-      // Also move the copies to physical registers into the loop block
-      MachineBasicBlock &MBB = *MI.getParent();
-      MachineBasicBlock::iterator Start(&MI);
-      while (Start->getOpcode() != FrameSetupOpcode)
-        --Start;
-      MachineBasicBlock::iterator End(&MI);
-      while (End->getOpcode() != FrameDestroyOpcode)
-        ++End;
-      // Also include following copies of the return value
-      ++End;
-      while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
-             MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
-        ++End;
-      CreatedBB =
-          loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
+      createWaterFall(&MI, MDT, {Dest});
     }
   }
 
```
```diff
@@ -7611,6 +7608,33 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
   legalizeOperandsVALUt16(MI, OpIdx, MRI);
 }
 
+void SIInstrInfo::createWaterFall(MachineInstr *MI, MachineDominatorTree *MDT,
+                                  ArrayRef<MachineOperand *> ScalarOps,
+                                  ArrayRef<Register> PhySGPRs) const {
+  if (MI->getOpcode() == AMDGPU::SI_CALL_ISEL) {
+    // Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
+    // following copies, we also need to move copies from and to physical
+    // registers into the loop block.
+    // Also move the copies to physical registers into the loop block
+    MachineBasicBlock &MBB = *MI->getParent();
+    MachineBasicBlock::iterator Start(MI);
+    while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
+      --Start;
+    MachineBasicBlock::iterator End(MI);
+    while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
+      ++End;
+
+    // Also include following copies of the return value
+    ++End;
+    while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
```
Suggested change (marked outdated):

```diff
-    while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
+    while (End != MBB.end() && End->isCopy() &&
```

Reviewer: Better to do this in terms of register classes than register sizes.
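One way to read that suggestion (an assumption on my part, not code from the PR): decide by register class rather than by comparing sizes in bits, along the lines of:

```cpp
// Sketch of a class-based check; Reg, MRI and TRI stand in for whatever the
// copy scan actually has in scope (assumed names, not the patch's code).
const TargetRegisterClass *RC = TRI->getRegClassForReg(MRI, Reg);
if (SIRegisterInfo::isSGPRClass(RC)) {
  // Treat the copied value as scalar.
}
```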