Commits (35)
024cb4f
generate waterfall for calls with sgpr argument(inreg)
Shoreshen Jul 4, 2025
a2ae8c8
search only within block
Shoreshen Jul 4, 2025
6709725
remove head file
Shoreshen Jul 4, 2025
31bbc2b
fis jmmartinez's comments
Shoreshen Jul 10, 2025
502cb2c
fix juan & matthew & shilei's comments
Shoreshen Jul 21, 2025
7434975
fix matthew's comment
Shoreshen Jul 21, 2025
d8befc9
move handling to movetoVALU, merge waterfall gen for SI_CALL_ISEL
Shoreshen Jul 23, 2025
c744d82
fix unrelated change
Shoreshen Jul 23, 2025
e3c1295
fix shilei's comments
Shoreshen Jul 24, 2025
8bd9a66
Add decription or createWaterFall
Shoreshen Jul 28, 2025
79f8dd5
fix Juan's comments
Shoreshen Aug 1, 2025
d0c6ecd
fix matthew's comments
Shoreshen Aug 12, 2025
6af7b97
fix format error
Shoreshen Aug 12, 2025
8543612
fix format
Shoreshen Sep 5, 2025
ad6a65d
fix test
Shoreshen Sep 12, 2025
ab0476f
fix conflict
Shoreshen Sep 12, 2025
f7a7ea1
fix comments
Shoreshen Sep 12, 2025
5815b3c
Merge branch 'main' into inreg-call-gen-waterfall
Shoreshen Sep 15, 2025
fd01203
Merge branch 'main' into inreg-call-gen-waterfall
Shoreshen Sep 16, 2025
9e73fab
Merge branch 'main' into inreg-call-gen-waterfall
Shoreshen Sep 17, 2025
54cf1e6
fix comments
Shoreshen Sep 17, 2025
3f5cab9
Merge branch 'main' into inreg-call-gen-waterfall
Shoreshen Sep 22, 2025
50f01fd
Merge branch 'main' into inreg-call-gen-waterfall
Shoreshen Sep 24, 2025
015026d
Merge branch 'main' into inreg-call-gen-waterfall
Shoreshen Sep 26, 2025
41cd451
Merge branch 'main' into inreg-call-gen-waterfall
Shoreshen Sep 26, 2025
892cc6a
Merge branch 'main' into inreg-call-gen-waterfall
Shoreshen Sep 28, 2025
0a1318f
Merge branch 'main' into inreg-call-gen-waterfall
Shoreshen Sep 28, 2025
2a810be
Merge branch 'main' into inreg-call-gen-waterfall
Shoreshen Sep 29, 2025
a43e0bd
Merge remote-tracking branch 'origin/main' into inreg-call-gen-waterfall
Shoreshen Oct 9, 2025
8aeb5bf
MRI.constrainRegClass never pass for phy reg
Shoreshen Oct 9, 2025
46237f6
fix test case
Shoreshen Oct 9, 2025
53542ec
Merge branch 'main' into inreg-call-gen-waterfall
Shoreshen Oct 10, 2025
8391ff3
Merge branch 'main' into inreg-call-gen-waterfall
Shoreshen Oct 11, 2025
8d43378
fix comment
Shoreshen Oct 11, 2025
3ac45a6
Merge branch 'main' into inreg-call-gen-waterfall
Shoreshen Oct 13, 2025
14 changes: 10 additions & 4 deletions llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -910,12 +910,18 @@ bool SIFixSGPRCopies::lowerSpecialCase(MachineInstr &MI,
TII->get(AMDGPU::V_READFIRSTLANE_B32), TmpReg)
.add(MI.getOperand(1));
MI.getOperand(1).setReg(TmpReg);
} else if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(),
MI, MI.getDebugLoc())) {
return true;
}

if (tryMoveVGPRConstToSGPR(MI.getOperand(1), DstReg, MI.getParent(), MI,
MI.getDebugLoc())) {
I = std::next(I);
MI.eraseFromParent();
return true;
}
return true;

if (!SrcReg.isVirtual())
return true;
Comment on lines +923 to +924
arsenm (Contributor):
I don't understand this early exit; the !SrcReg.isVirtual() case has existing explicit handling just below here.

Shoreshen (Contributor, Author):
Hi @arsenm, the upcoming if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) handles the case where DstReg.isVirtual() and !SrcReg.isVirtual(). If !DstReg.isVirtual(), we skip this copy.
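
For context, a condensed skeleton of the control flow under discussion (an illustrative sketch reconstructed from this hunk and the reply above, not the verbatim source):

if (!DstReg.isVirtual()) {
  // Physical-SGPR destination: V_READFIRSTLANE_B32 insertion or constant
  // folding, as in the hunk above.
  if (!SrcReg.isVirtual())
    return true; // physical -> physical copy: nothing more to rewrite here
}
// Reached only when DstReg is virtual: physical and AGPR sources are
// expanded through the moveToVALU worklist below.
if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
  // ...
}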

}
if (!SrcReg.isVirtual() || TRI->isAGPR(*MRI, SrcReg)) {
SIInstrWorklist worklist;
@@ -941,7 +947,7 @@ void SIFixSGPRCopies::analyzeVGPRToSGPRCopy(MachineInstr* MI) {
if (PHISources.contains(MI))
return;
Register DstReg = MI->getOperand(0).getReg();
const TargetRegisterClass *DstRC = MRI->getRegClass(DstReg);
const TargetRegisterClass *DstRC = TRI->getRegClassForReg(*MRI, DstReg);

V2SCopyInfo Info(getNextVGPRToSGPRCopyId(), MI,
TRI->getRegSizeInBits(*DstRC));
184 changes: 139 additions & 45 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -6861,13 +6861,10 @@ void SIInstrInfo::legalizeGenericOperand(MachineBasicBlock &InsertMBB,
// Emit the actual waterfall loop, executing the wrapped instruction for each
// unique value of \p ScalarOps across all lanes. In the best case we execute 1
// iteration, in the worst case we execute 64 (once per lane).
static void
emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,
MachineRegisterInfo &MRI,
MachineBasicBlock &LoopBB,
MachineBasicBlock &BodyBB,
const DebugLoc &DL,
ArrayRef<MachineOperand *> ScalarOps) {
static void emitLoadScalarOpsFromVGPRLoop(
const SIInstrInfo &TII, MachineRegisterInfo &MRI, MachineBasicBlock &LoopBB,
MachineBasicBlock &BodyBB, const DebugLoc &DL,
ArrayRef<MachineOperand *> ScalarOps, ArrayRef<Register> PhySGPRs = {}) {
MachineFunction &MF = *LoopBB.getParent();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -6876,8 +6873,7 @@ emitLoadScalarOpsFromVGPRLoop(const SIInstrInfo &TII,

MachineBasicBlock::iterator I = LoopBB.begin();
Register CondReg;

for (MachineOperand *ScalarOp : ScalarOps) {
for (auto [Idx, ScalarOp] : enumerate(ScalarOps)) {
unsigned RegSize = TRI->getRegSizeInBits(ScalarOp->getReg(), MRI);
unsigned NumSubRegs = RegSize / 32;
Register VScalarOp = ScalarOp->getReg();
@@ -6906,7 +6902,16 @@
}

// Update ScalarOp operand to use the SGPR ScalarOp.
ScalarOp->setReg(CurReg);
if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
ScalarOp->setReg(CurReg);
else {
// Insert the copy into the same block as the use.
BuildMI(*ScalarOp->getParent()->getParent(),
ScalarOp->getParent()->getIterator(), DL, TII.get(AMDGPU::COPY),
PhySGPRs[Idx])
.addReg(CurReg);
ScalarOp->setReg(PhySGPRs[Idx]);
}
ScalarOp->setIsKill();
} else {
SmallVector<Register, 8> ReadlanePieces;
@@ -6975,7 +6980,15 @@
}

// Update ScalarOp operand to use the SGPR ScalarOp.
ScalarOp->setReg(SScalarOp);
if (PhySGPRs.empty() || !PhySGPRs[Idx].isValid())
ScalarOp->setReg(SScalarOp);
else {
BuildMI(*ScalarOp->getParent()->getParent(),
ScalarOp->getParent()->getIterator(), DL, TII.get(AMDGPU::COPY),
Contributor:
Avoid repeating ScalarOp->getParent()->getIterator()

PhySGPRs[Idx])
.addReg(SScalarOp);
ScalarOp->setReg(PhySGPRs[Idx]);
}
ScalarOp->setIsKill();
}
}
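
For readers new to the pattern, here is a minimal sketch (not the PR's exact code) of what this loop emits per iteration for a single 32-bit VGPR operand on a wave-64 target; the real function also handles wider operands by reading and comparing subregister pairs:

// Sketch only: core of one waterfall iteration for a 32-bit operand.
Register CurReg = MRI.createVirtualRegister(&AMDGPU::SGPR_32RegClass);
Register CondReg = MRI.createVirtualRegister(TRI->getWaveMaskRegClass());
Register SaveExec = MRI.createVirtualRegister(TRI->getWaveMaskRegClass());
// Read the first active lane's value into an SGPR.
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_READFIRSTLANE_B32), CurReg)
    .addReg(VScalarOp);
// Compute the mask of all lanes holding that same value.
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::V_CMP_EQ_U32_e64), CondReg)
    .addReg(CurReg)
    .addReg(VScalarOp);
// Restrict exec to those lanes, remembering the previous mask.
BuildMI(LoopBB, I, DL, TII.get(AMDGPU::S_AND_SAVEEXEC_B64), SaveExec)
    .addReg(CondReg);
// ... the wrapped instruction executes here, with CurReg (or the requested
// physical SGPR from PhySGPRs) as a now-uniform scalar operand ...
// Drop the lanes just handled and loop while any remain active.
BuildMI(BodyBB, BodyBB.end(), DL, TII.get(AMDGPU::S_XOR_B64_term),
        AMDGPU::EXEC)
    .addReg(AMDGPU::EXEC)
    .addReg(SaveExec);
BuildMI(BodyBB, BodyBB.end(), DL, TII.get(AMDGPU::S_CBRANCH_EXECNZ))
    .addMBB(&LoopBB);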
@@ -7006,7 +7019,10 @@
ArrayRef<MachineOperand *> ScalarOps,
MachineDominatorTree *MDT,
MachineBasicBlock::iterator Begin = nullptr,
MachineBasicBlock::iterator End = nullptr) {
MachineBasicBlock::iterator End = nullptr,
ArrayRef<Register> PhySGPRs = {}) {
assert((PhySGPRs.empty() || PhySGPRs.size() == ScalarOps.size()) &&
"Physical SGPRs must be empty or match the number of scalar operands");
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
@@ -7091,7 +7107,8 @@
}
}

emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps);
emitLoadScalarOpsFromVGPRLoop(TII, MRI, *LoopBB, *BodyBB, DL, ScalarOps,
PhySGPRs);
Copilot AI (Jul 4, 2025):
Inserting a waterfall loop alters control flow and can affect debug info and profiling data. Ensure debug locations and profiling intrinsics are preserved or updated for accurate performance analysis.

Copilot generated this review using guidance from repository custom instructions.


MachineBasicBlock::iterator First = RemainderBB->begin();
// Restore SCC
@@ -7328,27 +7345,7 @@ SIInstrInfo::legalizeOperands(MachineInstr &MI,
if (MI.getOpcode() == AMDGPU::SI_CALL_ISEL) {
MachineOperand *Dest = &MI.getOperand(0);
if (!RI.isSGPRClass(MRI.getRegClass(Dest->getReg()))) {
// Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN and
// following copies, we also need to move copies from and to physical
// registers into the loop block.
unsigned FrameSetupOpcode = getCallFrameSetupOpcode();
unsigned FrameDestroyOpcode = getCallFrameDestroyOpcode();

// Also move the copies to physical registers into the loop block
MachineBasicBlock &MBB = *MI.getParent();
MachineBasicBlock::iterator Start(&MI);
while (Start->getOpcode() != FrameSetupOpcode)
--Start;
MachineBasicBlock::iterator End(&MI);
while (End->getOpcode() != FrameDestroyOpcode)
++End;
// Also include following copies of the return value
++End;
while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
MI.definesRegister(End->getOperand(1).getReg(), /*TRI=*/nullptr))
++End;
CreatedBB =
loadMBUFScalarOperandsFromVGPR(*this, MI, {Dest}, MDT, Start, End);
createWaterFall(&MI, MDT, {Dest});
}
}

@@ -7611,6 +7608,33 @@ void SIInstrInfo::legalizeOperandsVALUt16(MachineInstr &MI,
legalizeOperandsVALUt16(MI, OpIdx, MRI);
}

void SIInstrInfo::createWaterFall(MachineInstr *MI, MachineDominatorTree *MDT,
ArrayRef<MachineOperand *> ScalarOps,
ArrayRef<Register> PhySGPRs) const {
if (MI->getOpcode() == AMDGPU::SI_CALL_ISEL) {
// Move everything between ADJCALLSTACKUP and ADJCALLSTACKDOWN, together
// with the trailing copies of the return value, into the loop block;
// copies from and to physical registers must stay inside the loop.
MachineBasicBlock &MBB = *MI->getParent();
MachineBasicBlock::iterator Start(MI);
while (Start->getOpcode() != AMDGPU::ADJCALLSTACKUP)
--Start;
MachineBasicBlock::iterator End(MI);
while (End->getOpcode() != AMDGPU::ADJCALLSTACKDOWN)
++End;

// Also include following copies of the return value
++End;
while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
Contributor:
If End is a copy, the operands must be registers

Suggested change:
- while (End != MBB.end() && End->isCopy() && End->getOperand(1).isReg() &&
+ while (End != MBB.end() && End->isCopy() &&

MI->definesRegister(End->getOperand(1).getReg(), &RI))
++End;

loadMBUFScalarOperandsFromVGPR(*this, *MI, ScalarOps, MDT, Start, End,
PhySGPRs);
}
}
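
To illustrate the range this computes, a typical captured region looks roughly like the following (hypothetical, simplified MIR shown as comments; register and symbol names are made up):

// Illustrative region for a call whose callee pointer (and an inreg
// argument) still live in VGPRs:
//
//   Start -> ADJCALLSTACKUP 0, 0, ...
//            $sgpr4 = COPY %inreg_arg            ; argument set-up copies
//            $vgpr0 = COPY %vector_arg
//            %ret   = SI_CALL_ISEL %callee_ptr, @f, ...
//            ADJCALLSTACKDOWN 0, 0, ...
//            %r     = COPY $vgpr0                ; return-value copies
//   End   -> first instruction after those copies
//
// loadMBUFScalarOperandsFromVGPR then moves [Start, End) into the loop
// body, keeping the physical-register defs and their uses together.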

void SIInstrInfo::moveToVALU(SIInstrWorklist &Worklist,
MachineDominatorTree *MDT) const {

@@ -7630,6 +7654,86 @@
assert(Worklist.empty() &&
"Deferred MachineInstr are not supposed to re-populate worklist");
}

for (std::pair<MachineInstr *, V2PhysSCopyInfo> &Entry : Worklist.WaterFalls)
createWaterFall(Entry.first, MDT, Entry.second.MOs, Entry.second.SGPRs);

for (std::pair<MachineInstr *, bool> Entry : Worklist.V2PhySCopiesToErase)
if (Entry.second)
Entry.first->eraseFromParent();
}
void SIInstrInfo::createReadFirstLaneFromCopyToPhysReg(
MachineRegisterInfo &MRI, Register DstReg, MachineInstr &Inst) const {
// If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
// hope for the best.
unsigned RegSize = RI.getRegSizeInBits(DstReg, MRI);
unsigned NumSubRegs = RegSize / 32;
Contributor:
Better to do this in terms of register classes than register sizes

if (NumSubRegs == 1) {
Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
.add(Inst.getOperand(1));
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
DstReg)
.addReg(NewDst);
} else {
SmallVector<Register, 8> DstRegs;
for (unsigned i = 0; i < NumSubRegs; ++i) {
Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
.addReg(Inst.getOperand(1).getReg(), 0, RI.getSubRegFromChannel(i));
DstRegs.push_back(NewDst);
}
MachineInstrBuilder MIB =
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
get(AMDGPU::REG_SEQUENCE), DstReg);
for (unsigned i = 0; i < NumSubRegs; ++i) {
MIB.addReg(DstRegs[i]);
MIB.addImm(RI.getSubRegFromChannel(i));
}
}
}

void SIInstrInfo::handleCopyToPhyHelper(SIInstrWorklist &Worklist,
Register DstReg, MachineInstr &Inst,
MachineRegisterInfo &MRI) const {
if (DstReg == AMDGPU::M0) {
createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
Worklist.V2PhySCopiesToErase.try_emplace(&Inst, true);
return;
}
Register SrcReg = Inst.getOperand(1).getReg();
MachineBasicBlock::iterator I = Inst.getIterator();
MachineBasicBlock::iterator E = Inst.getParent()->end();
// Only search the current block, since a physreg's def and use cannot
// cross blocks when MF.NoPhi = false.
while (++I != E) {
// Currently, we only support waterfall on SI_CALL_ISEL.
if (I->getOpcode() == AMDGPU::SI_CALL_ISEL) {
MachineInstr *UseMI = &*I;
for (unsigned i = 0; i < UseMI->getNumOperands(); ++i) {
if (UseMI->getOperand(i).isReg() &&
UseMI->getOperand(i).getReg() == DstReg) {
MachineOperand *MO = &UseMI->getOperand(i);
MO->setReg(SrcReg);
V2PhysSCopyInfo &V2SCopyInfo = Worklist.WaterFalls[UseMI];
V2SCopyInfo.MOs.push_back(MO);
V2SCopyInfo.SGPRs.push_back(DstReg);
Worklist.V2PhySCopiesToErase.try_emplace(&Inst, true);
}
}
} else if (I->getOpcode() == AMDGPU::SI_RETURN_TO_EPILOG &&
I->getOperand(0).isReg() &&
I->getOperand(0).getReg() == DstReg) {
createReadFirstLaneFromCopyToPhysReg(MRI, DstReg, Inst);
Worklist.V2PhySCopiesToErase.try_emplace(&Inst, true);
} else if (I->readsRegister(DstReg, &RI))
// The COPY cannot be erased if any other kind of instruction uses it.
Worklist.V2PhySCopiesToErase[&Inst] = false;
if (I->findRegisterDefOperand(DstReg, &RI))
break;
}
}
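
A hedged before/after illustration of what this helper arranges for the SI_CALL_ISEL case (register names hypothetical):

// Before: moveToVALU found a VGPR copied into an ABI-fixed SGPR argument.
//   $sgpr4 = COPY %vgpr_val
//   ...    = SI_CALL_ISEL ..., implicit $sgpr4
//
// handleCopyToPhyHelper rewrites the call's operand to use %vgpr_val,
// records (operand, $sgpr4) in Worklist.WaterFalls[call], and marks the
// COPY for erasure. createWaterFall later wraps the call so that each
// iteration re-establishes the physical register:
//   %uniform = V_READFIRSTLANE_B32 %vgpr_val
//   $sgpr4   = COPY %uniform          ; emitted inside the loop body
//   ...      = SI_CALL_ISEL ..., implicit $sgpr4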

void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
@@ -8106,19 +8210,9 @@
Register DstReg = Inst.getOperand(0).getReg();
const TargetRegisterClass *NewDstRC = getDestEquivalentVGPRClass(Inst);

// If it's a copy of a VGPR to a physical SGPR, insert a V_READFIRSTLANE and
// hope for the best.
if (Inst.isCopy() && DstReg.isPhysical() &&
RI.isVGPR(MRI, Inst.getOperand(1).getReg())) {
Register NewDst = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(),
get(AMDGPU::V_READFIRSTLANE_B32), NewDst)
.add(Inst.getOperand(1));
BuildMI(*Inst.getParent(), &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY),
DstReg)
.addReg(NewDst);

Inst.eraseFromParent();
handleCopyToPhyHelper(Worklist, DstReg, Inst, MRI);
return;
}

24 changes: 24 additions & 0 deletions llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -52,6 +52,13 @@ static const MachineMemOperand::Flags MOLastUse =
static const MachineMemOperand::Flags MOCooperative =
MachineMemOperand::MOTargetFlag3;

struct V2PhysSCopyInfo {
// Operands that need to be replaced by the waterfall loop
SmallVector<MachineOperand *> MOs;
// Target physical registers replacing the MOs
SmallVector<Register> SGPRs;
};

/// Utility to store machine instructions worklist.
struct SIInstrWorklist {
SIInstrWorklist() = default;
@@ -79,6 +86,9 @@ struct SIInstrWorklist {

SetVector<MachineInstr *> &getDeferredList() { return DeferredList; }

DenseMap<MachineInstr *, V2PhysSCopyInfo> WaterFalls;
DenseMap<MachineInstr *, bool> V2PhySCopiesToErase;

private:
/// InstrList contains the MachineInstrs.
SetVector<MachineInstr *> InstrList;
@@ -1407,6 +1417,12 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {

void moveToVALUImpl(SIInstrWorklist &Worklist, MachineDominatorTree *MDT,
MachineInstr &Inst) const;
/// Wrapper for generating a waterfall loop around instruction \p MI.
/// It takes the related preceding and succeeding instructions (e.g. the
/// call sequence around SI_CALL_ISEL) into consideration.
void createWaterFall(MachineInstr *MI, MachineDominatorTree *MDT,
ArrayRef<MachineOperand *> ScalarOps,
ArrayRef<Register> PhySGPRs = {}) const;

void insertNoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
@@ -1595,6 +1611,14 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {

const TargetSchedModel &getSchedModel() const { return SchedModel; }

void createReadFirstLaneFromCopyToPhysReg(MachineRegisterInfo &MRI,
Register DstReg,
MachineInstr &Inst) const;

void handleCopyToPhyHelper(SIInstrWorklist &Worklist, Register DstReg,
MachineInstr &Inst,
MachineRegisterInfo &MRI) const;

// Enforce operand's \p OpName even alignment if required by target.
// This is used if an operand is a 32 bit register but needs to be aligned
// regardless.

This file was deleted.
