Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
147 changes: 145 additions & 2 deletions llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "GCNSubtarget.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include <deque>

using namespace llvm;

Expand Down Expand Up @@ -50,6 +51,7 @@ class SIPostRABundler {
bool run(MachineFunction &MF);

private:
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI;

SmallSet<Register, 16> Defs;
Expand All @@ -60,6 +62,9 @@ class SIPostRABundler {
bool isBundleCandidate(const MachineInstr &MI) const;
bool isDependentLoad(const MachineInstr &MI) const;
bool canBundle(const MachineInstr &MI, const MachineInstr &NextMI) const;
void reorderLoads(MachineBasicBlock &MBB,
MachineBasicBlock::instr_iterator &BundleStart,
MachineBasicBlock::instr_iterator Next);
};

constexpr uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
Expand Down Expand Up @@ -129,6 +134,141 @@ bool SIPostRABundler::canBundle(const MachineInstr &MI,
!isDependentLoad(NextMI));
}

/// Return the register written by the first explicit def of \p MI.
/// Callers must ensure the instruction has at least one explicit def.
static Register getDef(MachineInstr &MI) {
  assert(MI.getNumExplicitDefs() > 0);
  const MachineOperand &FirstDef = *MI.defs().begin();
  return FirstDef.getReg();
}

/// Reorder the load instructions of the clause [BundleStart, Next) so that
/// loads whose results are consumed soonest after the clause are issued
/// first.  The sort key is the distance (in instructions following the
/// clause) to the first use of each load's defined register; the original
/// program order is used as a tie-breaker so the reordering is stable.
/// \p BundleStart is updated in place if the clause's first instruction
/// changes.
void SIPostRABundler::reorderLoads(
    MachineBasicBlock &MBB, MachineBasicBlock::instr_iterator &BundleStart,
    MachineBasicBlock::instr_iterator Next) {
  // Don't reorder ALU, store or scalar clauses.
  if (!BundleStart->mayLoad() || BundleStart->mayStore() ||
      SIInstrInfo::isSMRD(*BundleStart) || !BundleStart->getNumExplicitDefs())
    return;

  // Search to find the usage distance of each defined register in the clause.
  // Use an explicit template argument: std::max(Defs.size(), 100UL) fails to
  // compile on LLP64 targets (64-bit MSVC) where size_t is unsigned long long
  // but 100UL is unsigned long, so the arguments deduce to different types.
  const unsigned SearchDistance = std::max<size_t>(Defs.size(), 100);
  SmallDenseMap<Register, unsigned> UseDistance;
  unsigned MaxDistance = 0;
  for (MachineBasicBlock::iterator SearchI = Next;
       SearchI != MBB.end() && MaxDistance < SearchDistance &&
       UseDistance.size() < Defs.size();
       ++SearchI, ++MaxDistance) {
    for (Register Reg : Defs) {
      if (UseDistance.contains(Reg))
        continue;
      if (SearchI->readsRegister(Reg, TRI))
        UseDistance[Reg] = MaxDistance;
    }
  }

  // No register defined by the clause is used nearby; nothing to gain.
  if (UseDistance.empty())
    return;

  LLVM_DEBUG(dbgs() << "Try bundle reordering\n");

  // Build schedule based on use distance of register uses.
  // Attempt to preserve existing order (NativeOrder) where possible.
  std::deque<std::pair<MachineInstr *, unsigned>> Schedule;
  unsigned NativeOrder = 0, LastOrder = 0;
  bool Reordered = false;
  for (auto II = BundleStart; II != Next; ++II, ++NativeOrder) {
    // Bail out if we encounter anything that seems risky to reorder.
    if (!II->getNumExplicitDefs() || II->isKill() ||
        llvm::any_of(II->memoperands(), [&](const MachineMemOperand *MMO) {
          return MMO->isAtomic() || MMO->isVolatile();
        })) {
      LLVM_DEBUG(dbgs() << " Abort\n");
      return;
    }

    Register Reg = getDef(*II);
    // Loads whose results are never read within SearchDistance sort last.
    unsigned NewOrder =
        UseDistance.contains(Reg) ? UseDistance[Reg] : MaxDistance;
    LLVM_DEBUG(dbgs() << "  Order: " << NewOrder << "," << NativeOrder
                      << ", MI: " << *II);
    // Pack use distance (major key) and native order (minor key) into one
    // sort key; assumes both fit in 16 bits, which holds for realistic
    // clause lengths and the bounded SearchDistance.
    unsigned Order = (NewOrder << 16 | NativeOrder);
    Schedule.emplace_back(&*II, Order);
    Reordered |= Order < LastOrder;
    LastOrder = Order;
  }

  // No reordering found.
  if (!Reordered) {
    LLVM_DEBUG(dbgs() << " No changes\n");
    return;
  }

  // Apply sort on new ordering.
  std::sort(Schedule.begin(), Schedule.end(),
            [](const std::pair<MachineInstr *, unsigned> &A,
               const std::pair<MachineInstr *, unsigned> &B) {
              return A.second < B.second;
            });

  // Rebuild clause order.
  // Schedule holds ideal order for the load operations; however, each def
  // can only be scheduled when it will no longer clobber any uses.
  SmallVector<MachineInstr *> Clause;
  while (!Schedule.empty()) {
    // Try to schedule next instruction in schedule.
    // Iterate until we find something that can be placed.
    auto It = Schedule.begin();
    while (It != Schedule.end()) {
      MachineInstr *MI = It->first;
      LLVM_DEBUG(dbgs() << "Try schedule: " << *MI);

      if (MI->getNumExplicitDefs() == 0) {
        // No defs, always schedule.
        LLVM_DEBUG(dbgs() << "  Trivially OK\n");
        break;
      }

      Register DefReg = getDef(*MI);
      bool DefRegHasUse = false;
      for (auto SearchIt = std::next(It);
           SearchIt != Schedule.end() && !DefRegHasUse; ++SearchIt)
        DefRegHasUse = SearchIt->first->readsRegister(DefReg, TRI);
      if (DefRegHasUse) {
        // A future use would be clobbered; try next instruction in the
        // schedule.
        LLVM_DEBUG(dbgs() << "  Clobbers uses\n");
        It++;
        continue;
      }

      // Safe to schedule.
      LLVM_DEBUG(dbgs() << "  OK!\n");
      break;
    }

    // Place scheduled instruction into clause order.
    assert(It != Schedule.end());
    MachineInstr *MI = It->first;
    Schedule.erase(It);
    Clause.push_back(MI);

    // Clear kill flags for later uses: a register killed here may still be
    // read by an instruction that now comes after it in the new order.
    for (auto &Use : MI->all_uses()) {
      if (!Use.isReg() || !Use.isKill())
        continue;
      Register UseReg = Use.getReg();
      if (llvm::any_of(Schedule, [&](std::pair<MachineInstr *, unsigned> &SI) {
            return SI.first->readsRegister(UseReg, TRI);
          }))
        Use.setIsKill(false);
    }
  }

  // Apply order to instructions.
  for (MachineInstr *MI : Clause)
    MI->moveBefore(&*Next);

  // Update start of bundle.
  BundleStart = Clause[0]->getIterator();
}

bool SIPostRABundlerLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
Expand All @@ -143,6 +283,8 @@ PreservedAnalyses SIPostRABundlerPass::run(MachineFunction &MF,

bool SIPostRABundler::run(MachineFunction &MF) {

const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
BitVector BundleUsedRegUnits(TRI->getNumRegUnits());
BitVector KillUsedRegUnits(TRI->getNumRegUnits());
Expand Down Expand Up @@ -170,7 +312,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
assert(Defs.empty());

if (I->getNumExplicitDefs() != 0)
Defs.insert(I->defs().begin()->getReg());
Defs.insert(getDef(*I));

MachineBasicBlock::instr_iterator BundleStart = I;
MachineBasicBlock::instr_iterator BundleEnd = I;
Expand All @@ -182,7 +324,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
if (canBundle(*BundleEnd, *I)) {
BundleEnd = I;
if (I->getNumExplicitDefs() != 0)
Defs.insert(I->defs().begin()->getReg());
Defs.insert(getDef(*I));
++ClauseLength;
} else if (!I->isMetaInstruction() ||
I->getOpcode() == AMDGPU::SCHED_BARRIER) {
Expand Down Expand Up @@ -234,6 +376,7 @@ bool SIPostRABundler::run(MachineFunction &MF) {
BundleUsedRegUnits.reset();
}

reorderLoads(MBB, BundleStart, Next);
finalizeBundle(MBB, BundleStart, Next);
}

Expand Down
6 changes: 3 additions & 3 deletions llvm/test/CodeGen/AMDGPU/GlobalISel/add.vni16.ll
Original file line number Diff line number Diff line change
Expand Up @@ -716,17 +716,17 @@ define void @add_v11i16(ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb, ptr addr
; GFX9-LABEL: add_v11i16:
; GFX9: ; %bb.0:
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
; GFX9-NEXT: global_load_ushort v14, v[0:1], off offset:16
; GFX9-NEXT: global_load_ushort v15, v[2:3], off offset:16
; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off
; GFX9-NEXT: global_load_dwordx4 v[10:13], v[2:3], off
; GFX9-NEXT: global_load_ushort v16, v[2:3], off offset:20
; GFX9-NEXT: global_load_ushort v17, v[0:1], off offset:20
; GFX9-NEXT: global_load_ushort v18, v[0:1], off offset:18
; GFX9-NEXT: global_load_ushort v19, v[2:3], off offset:18
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: s_waitcnt vmcnt(7)
; GFX9-NEXT: v_and_b32_e32 v14, 0xffff, v14
; GFX9-NEXT: s_waitcnt vmcnt(5)
; GFX9-NEXT: s_waitcnt vmcnt(6)
; GFX9-NEXT: v_and_b32_e32 v15, 0xffff, v15
; GFX9-NEXT: s_waitcnt vmcnt(4)
; GFX9-NEXT: v_pk_add_u16 v0, v6, v10
Expand Down
Loading
Loading