344 changes: 313 additions & 31 deletions llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp
@@ -28,17 +28,26 @@
/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
/// the VGPR_32, the COPY can be completely eliminated.
///
/// Additionally, this pass unpacks packed instructions (V_PK_MUL_F32/F16,
/// V_PK_ADD_F32/F16, V_PK_FMA_F32) adjacent to MFMAs so that they can be
/// co-issued. This helps overlap MFMAs with certain vector instructions in
/// machine schedules and is expected to improve performance. Only packed
/// instructions that fall within the MFMA's latency window are unpacked; the
/// rest remain untouched.
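/// For example (illustrative only; operand lists and modifiers simplified):
///   %dst:vreg_64_align2 = V_PK_MUL_F32 %src0:vreg_64_align2, %src1:vreg_64_align2
/// is rewritten as two 32-bit VOP3 instructions:
///   %dst.sub0 = V_MUL_F32_e64 %src0.sub0, %src1.sub0
///   %dst.sub1 = V_MUL_F32_e64 %src0.sub1, %src1.sub1
/// with op_sel/op_sel_hi translated into the sub0/sub1 operand selection and
/// neg/neg_hi translated into the NEG source modifier of the matching lane.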
/// TODO: Add support for F16 packed instructions
//===----------------------------------------------------------------------===//

#include "GCNPreRAOptimizations.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
@@ -53,6 +62,38 @@ class GCNPreRAOptimizationsImpl {
LiveIntervals *LIS;

bool processReg(Register Reg);
// Creates a list of packed instructions following an MFMA that are suitable
// for unpacking.
bool createListOfPackedInstr(MachineInstr &BeginMI,
SetVector<MachineInstr *> &InstrsToUnpack,
uint16_t NumMFMACycles);
// Check whether the machine instruction being processed is a supported packed
// instruction.
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
// Unpack F32 packed instructions. Currently only V_PK_MUL_F32, V_PK_ADD_F32,
// and V_PK_FMA_F32 are supported for this transformation.
void processF32Unpacking(MachineInstr &I);
// Map a packed instruction to the opcode of its unpacked (VOP3) equivalent.
uint16_t mapToUnpackedOpcode(MachineInstr &I);
// Insert the unpacked instructions into the basic block.
void insertUnpackedF32MI(MachineInstr &I, bool IsVreg_64, bool IsFMA);
// Create the unpacked instruction to be inserted, adding source modifiers
// based on the source modifiers of the packed instruction.
MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I,
const DebugLoc &DL,
uint16_t UnpackedOpcode, bool IsHiBits,
bool IsFMA);
// Identify register dependencies between those used by the MFMA
// instruction and the following packed instructions. Conservatively ensures
// that we do not incorrectly read/write registers.
bool hasReadWriteDependencies(const MachineInstr &PredMI,
const MachineInstr &SuccMI);

void addOperandandMods(MachineInstrBuilder NewMI, unsigned Src_Mods,
unsigned NegModifier, unsigned OpSelModifier,
MachineOperand &SrcMO);

public:
GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
@@ -225,6 +266,228 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
return true;
}

bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(
Contributor: We should allow for unpacking with immediate operands.

MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
case AMDGPU::V_PK_ADD_F32:
case AMDGPU::V_PK_MUL_F32:
case AMDGPU::V_PK_FMA_F32:
return true;
default:
return false;
}
llvm_unreachable("Fully covered switch");
}
Contributor:
    llvm_unreachable("Fully covered switch");

Contributor: ping

uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
unsigned Opcode = I.getOpcode();
// Use the 64-bit encoding so that VOP3 instructions are emitted: VOP3P
// source modifiers can be translated to VOP3 source modifiers, whereas e32
// (VOP2) instructions do not support source modifiers at all.
switch (Opcode) {
case AMDGPU::V_PK_ADD_F32:
return AMDGPU::V_ADD_F32_e64;
case AMDGPU::V_PK_MUL_F32:
return AMDGPU::V_MUL_F32_e64;
case AMDGPU::V_PK_FMA_F32:
return AMDGPU::V_FMA_F32_e64;
default:
return std::numeric_limits<uint16_t>::max();
}
llvm_unreachable("Fully covered switch");
}
Contributor:
    llvm_unreachable("Fully covered switch");

Contributor: ping


bool GCNPreRAOptimizationsImpl::hasReadWriteDependencies(
const MachineInstr &PredMI, const MachineInstr &SuccMI) {
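// Conservatively treat any overlap between a register defined by the MFMA
// and a register defined by the candidate as a dependency.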
for (const MachineOperand &Pred_Ops : PredMI.operands()) {
if (!Pred_Ops.isReg() || !Pred_Ops.isDef())
continue;
Register Pred_Reg = Pred_Ops.getReg();
if (!Pred_Reg.isValid())
continue;
for (const MachineOperand &Succ_Ops : SuccMI.operands()) {
if (!Succ_Ops.isReg() || !Succ_Ops.isDef())
continue;
Register Succ_Reg = Succ_Ops.getReg();
if (!Succ_Reg.isValid())
continue;
if ((Pred_Reg == Succ_Reg) || TRI->regsOverlap(Pred_Reg, Succ_Reg)) {
return true;
}
}
}
return false;
}
bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
uint16_t NumMFMACycles) {
auto *BB = BeginMI.getParent();
auto E = BB->end();
int TotalCyclesBetweenCandidates = 0;
auto SchedModel = TII->getSchedModel();
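// Walk forward from the MFMA, accumulating an estimate of the issue cycles
// consumed by the intervening instructions; candidates are only collected
// while this estimate stays within the MFMA's latency window.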
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
const MCSchedClassDesc *InstrSchedClassDesc =
SchedModel.resolveSchedClass(&Instr);
TotalCyclesBetweenCandidates +=
Contributor: This cycle count modelling is inaccurate and doesn't properly account for observed latency between instructions. If there are dependencies in the instructions between the MFMA and unpack instruction, we must observe the full latency of the dependee.

For example, if we have a load and a use of the load between the MFMA and the unpack instruction, we will incur the full latency of the load between the MFMA and the unpack candidate.

Another interesting case is if the unpack candidate uses some value from the MFMA. In this case, the unpack candidate must wait for the MFMA regardless.

Contributor: Improving the cycle count modelling is not something that we should attempt to handle -- I view it more as a downside to the implementation choice.

To do proper cycle count modelling we would basically need to do ad-hoc reconstructions of the schedule DAG for the dependency modelling.

If we do believe that such modelling is needed, we should probably just move this unpacking into scheduling somewhere, and use the scheduler's mechanisms to do latency profitability checks.

SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;

if (Instr.isMetaInstruction())
continue;
if (Instr.isTerminator())
return false;
if (TotalCyclesBetweenCandidates > NumMFMACycles)
return false;
if (isUnpackingSupportedInstr(Instr) && TII->isNeverCoissue(Instr)) {
if (hasReadWriteDependencies(BeginMI, Instr))
return false;

// If it is a packed instruction, subtract its latency from the overall
// latency calculation here, because the packed instruction will be removed
// and replaced by two unpacked instructions.
TotalCyclesBetweenCandidates -=
SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
// We're adding 2 to account for the extra latency added by unpacking into
// 2 instructions. At the time of writing, the considered unpacked
// instructions have latency of 1.
// TODO: improve latency handling of possible inserted instructions
TotalCyclesBetweenCandidates += 2;
if (TotalCyclesBetweenCandidates < NumMFMACycles - 1)
InstrsToUnpack.insert(&Instr);
}
}
return true;
}

void GCNPreRAOptimizationsImpl::insertUnpackedF32MI(MachineInstr &I,
bool IsVreg_64,
bool IsFMA) {
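// Split the packed instruction into one unpacked instruction per 32-bit
// lane, transfer the relevant MI flags, and recompute the destination's
// live interval.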
MachineBasicBlock &MBB = *I.getParent();
const DebugLoc &DL = I.getDebugLoc();
Register DstReg = I.getOperand(0).getReg();

uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
return;

MachineInstrBuilder Op0L_Op1L = createUnpackedMI(
MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/false, /*IsFMA=*/IsFMA);
if (IsVreg_64 || I.getOperand(0).isUndef())
Op0L_Op1L->getOperand(0).setIsUndef();
LIS->InsertMachineInstrInMaps(*Op0L_Op1L);

MachineInstrBuilder Op0H_Op1H = createUnpackedMI(
MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/true, /*IsFMA=*/IsFMA);
LIS->InsertMachineInstrInMaps(*Op0H_Op1H);

if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
}
if (I.getFlag(MachineInstr::MIFlag::FmContract)) {
Op0L_Op1L->setFlag(MachineInstr::MIFlag::FmContract);
Op0H_Op1H->setFlag(MachineInstr::MIFlag::FmContract);
}

LIS->RemoveMachineInstrFromMaps(I);
I.eraseFromParent();
LIS->removeInterval(DstReg);
LIS->createAndComputeVirtRegInterval(DstReg);
return;
}

void GCNPreRAOptimizationsImpl::addOperandandMods(MachineInstrBuilder NewMI,
unsigned Src_Mods,
unsigned NegModifier,
unsigned OpSelModifier,
MachineOperand &SrcMO) {
unsigned New_Src_Mods = 0;
// If NEG or NEG_HI is set, the corresponding 32-bit lane must be negated.
// NEG_HI shares its bit position with ABS, but packed instructions do not
// support ABS while the unpacked VOP3 instructions do. The raw modifier bits
// therefore cannot simply be copied: NEG_HI must be explicitly translated
// into a NEG source modifier on the hi-lane instruction.
if (Src_Mods & NegModifier) {
New_Src_Mods |= SISrcMods::NEG;
}
// Source modifiers: only negation is carried over. Unpacked operations have
// no op_sel, so operand selection is handled explicitly below by picking the
// appropriate sub-register. Unpacked operations support abs, but packed
// instructions do not, so abs never needs to be translated.
NewMI.addImm(New_Src_Mods);
if (SrcMO.isImm()) {
NewMI.addImm(SrcMO.getImm());
} else {
if (Src_Mods & OpSelModifier) {
// op_sel set: select the high half (sub1) of the 64-bit source.
unsigned SrcSubIdx =
TRI->composeSubRegIndices(SrcMO.getSubReg(), AMDGPU::sub1);
NewMI.addReg(SrcMO.getReg(), 0, SrcSubIdx);
} else {
// op_sel clear: select the low half (sub0) of the 64-bit source.
unsigned SrcSubIdx =
TRI->composeSubRegIndices(SrcMO.getSubReg(), AMDGPU::sub0);
NewMI.addReg(SrcMO.getReg(), 0, SrcSubIdx);
}
}
}

MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(
MachineBasicBlock &MBB, MachineInstr &I, const DebugLoc &DL,
uint16_t UnpackedOpcode, bool IsHiBits, bool IsFMA) {
MachineOperand &DstMO = I.getOperand(0);
MachineOperand &SrcMO1 = I.getOperand(2);
MachineOperand &SrcMO2 = I.getOperand(4);
Register DstReg = DstMO.getReg();
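// The low half of the packed result is written to sub0 of the destination,
// the high half to sub1.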
unsigned DestSubIdx =
IsHiBits ? TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1)
: TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0);
int ClampIdx =
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
int64_t ClampVal = I.getOperand(ClampIdx).getImm();
int Src0_modifiers_Idx =
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
int Src1_modifiers_Idx =
AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);

unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm();
unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm();
// Packed instructions (VOP3P) do not support abs, so abs modifiers can
// safely be ignored here.
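// For the hi-lane instruction the relevant VOP3P bits are NEG_HI and
// OP_SEL_1; for the lo lane they are NEG and OP_SEL_0.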

unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;

MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
NewMI.addDef(DstReg, 0, DestSubIdx); // vdst
addOperandandMods(NewMI, Src0_Mods, NegModifier, OpSelModifier, SrcMO1);
addOperandandMods(NewMI, Src1_Mods, NegModifier, OpSelModifier, SrcMO2);

if (IsFMA) {
MachineOperand &SrcMO3 = I.getOperand(6);
int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx(
I.getOpcode(), AMDGPU::OpName::src2_modifiers);
unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm();
addOperandandMods(NewMI, Src2_Mods, NegModifier, OpSelModifier, SrcMO3);
}
NewMI.addImm(ClampVal); // clamp
// Packed instructions do not support output modifiers, so it is safe to
// pass 0 here.
NewMI.addImm(0); // omod
return NewMI;
}

void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) {
bool IsFMA = I.getOpcode() == AMDGPU::V_PK_FMA_F32;
const TargetRegisterClass *DstRC = MRI->getRegClass(I.getOperand(0).getReg());
bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
insertUnpackedF32MI(I, IsVReg64, IsFMA);
return;
}

bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
Expand Down Expand Up @@ -260,38 +523,57 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
Changed |= processReg(Reg);
}

if (!ST.useRealTrue16Insts())
return Changed;

// Add RA hints to improve True16 COPY elimination.
for (const MachineBasicBlock &MBB : MF) {
for (const MachineInstr &MI : MBB) {
if (MI.getOpcode() != AMDGPU::COPY)
continue;
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
if (Dst.isVirtual() &&
MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
Src.isPhysical() &&
TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
if (Src.isVirtual() &&
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
Dst.isPhysical() &&
TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
if (!Dst.isVirtual() || !Src.isVirtual())
continue;
if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
// Unpack packed instructions so that they overlap with MFMAs. This allows
// the compiler to co-issue the resulting unpacked instructions with the MFMA.
for (MachineBasicBlock &MBB : MF) {
SetVector<MachineInstr *> InstrsToUnpack;
SetVector<MachineOperand *> WriteOperands;
SetVector<MachineOperand *> ReadOperands;
uint16_t NumMFMACycles = 0;
auto SchedModel = TII->getSchedModel();
for (MachineInstr &MI : MBB) {
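// Use the MFMA's ReleaseAtCycle from the scheduling model as the latency
// window in which trailing packed instructions may be unpacked.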
if (SIInstrInfo::isMFMA(MI)) {
const MCSchedClassDesc *SchedClassDesc =
SchedModel.resolveSchedClass(&MI);
NumMFMACycles =
Contributor: Should we reduce for the issue latency of the MFMA?

Contributor: ping

SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
createListOfPackedInstr(MI, InstrsToUnpack, NumMFMACycles);
}
if (ST.useRealTrue16Insts()) {
if (MI.getOpcode() != AMDGPU::COPY)
continue;
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
if (Dst.isVirtual() &&
MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
Src.isPhysical() &&
TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
if (Src.isVirtual() &&
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
Dst.isPhysical() &&
TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
if (!Dst.isVirtual() || !Src.isVirtual())
continue;
if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
}
if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
}
if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
}
}

if (!InstrsToUnpack.empty()) {
for (MachineInstr *MI : InstrsToUnpack) {
processF32Unpacking(*MI);
}
}
}
LIS->reanalyze(MF);
return Changed;
}
}