Co-issue packed instructions by unpacking #151704
@@ -28,17 +28,26 @@
/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
/// the VGPR_32, the COPY can be completely eliminated.
///
/// Additionally, this pass unpacks packed instructions (V_PK_MUL_F32,
/// V_PK_ADD_F32, V_PK_FMA_F32) adjacent to MFMAs so that they can be
/// co-issued. This helps overlap MFMAs with certain vector instructions in
/// machine schedules and is expected to improve performance. Only packed
/// instructions that are covered by the MFMA's latency are unpacked; the rest
/// remain untouched.
/// TODO: Add support for F16 packed instructions.
//===----------------------------------------------------------------------===//

#include "GCNPreRAOptimizations.h"
#include "AMDGPU.h"
#include "GCNSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIInstrInfo.h"
#include "SIRegisterInfo.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/InitializePasses.h"

using namespace llvm;

#define DEBUG_TYPE "amdgpu-pre-ra-optimizations"
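
As a rough sketch of the unpacking described in the header above (schematic assembly only; the pass actually rewrites pre-RA virtual registers and carries over source modifiers, clamp, and omod as implemented below, and the concrete VGPRs here are purely illustrative):

```asm
; packed form: one VOP3P instruction computes both 32-bit lanes of v[0:1]
v_pk_add_f32 v[0:1], v[2:3], v[4:5]

; unpacked form placed in the MFMA's shadow: two independent VOP3 adds that
; the hardware can co-issue with the in-flight MFMA
v_add_f32_e64 v0, v2, v4   ; low lane  (sub0)
v_add_f32_e64 v1, v3, v5   ; high lane (sub1)
```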
@@ -53,6 +62,38 @@ class GCNPreRAOptimizationsImpl {
  LiveIntervals *LIS;

  bool processReg(Register Reg);
  // Creates a list of packed instructions following an MFMA that are suitable
  // for unpacking.
  bool createListOfPackedInstr(MachineInstr &BeginMI,
                               SetVector<MachineInstr *> &InstrsToUnpack,
                               uint16_t NumMFMACycles);
  // Check whether the machine instruction being processed is a supported
  // packed instruction.
  bool isUnpackingSupportedInstr(MachineInstr &MI) const;
  // Unpack F32 packed instructions. Currently only V_PK_MUL_F32, V_PK_ADD_F32,
  // and V_PK_FMA_F32 are supported for this transformation.
  void processF32Unpacking(MachineInstr &I);
  // Map a packed instruction to the corresponding unpacked opcode.
  uint16_t mapToUnpackedOpcode(MachineInstr &I);
  // Insert the appropriate unpacked instructions into the BB.
  void insertUnpackedF32MI(MachineInstr &I, bool IsVreg_64, bool IsFMA);
  // Creates the unpacked instruction to be inserted. Adds source modifiers to
  // the unpacked instructions based on the source modifiers in the packed
  // instruction.
  MachineInstrBuilder createUnpackedMI(MachineBasicBlock &MBB, MachineInstr &I,
                                       const DebugLoc &DL,
                                       uint16_t UnpackedOpcode, bool IsHiBits,
                                       bool IsFMA);
  // Identify register dependencies between those used by the MFMA instruction
  // and the following packed instructions. Conservatively ensures that we do
  // not incorrectly read/write registers.
  bool hasReadWriteDependencies(const MachineInstr &PredMI,
                                const MachineInstr &SuccMI);

  void addOperandandMods(MachineInstrBuilder NewMI, unsigned Src_Mods,
                         unsigned NegModifier, unsigned OpSelModifier,
                         MachineOperand &SrcMO);

public:
  GCNPreRAOptimizationsImpl(LiveIntervals *LS) : LIS(LS) {}
@@ -225,6 +266,228 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
  return true;
}

||
bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr( | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should allow for unpacking with immediate operands |
||
MachineInstr &MI) const { | ||
unsigned Opcode = MI.getOpcode(); | ||
switch (Opcode) { | ||
case AMDGPU::V_PK_ADD_F32: | ||
case AMDGPU::V_PK_MUL_F32: | ||
case AMDGPU::V_PK_FMA_F32: | ||
return true; | ||
default: | ||
return false; | ||
} | ||
llvm_unreachable("Fully covered switch"); | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. llvm_unreachable("Fully covered switch"); There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ping |

uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
  unsigned Opcode = I.getOpcode();
  // Use the 64-bit (VOP3) encoding: VOP3 allows the VOP3P source modifiers to
  // be translated, whereas the e32 (VOP2) encodings do not accept source
  // modifiers.
  switch (Opcode) {
  case AMDGPU::V_PK_ADD_F32:
    return AMDGPU::V_ADD_F32_e64;
  case AMDGPU::V_PK_MUL_F32:
    return AMDGPU::V_MUL_F32_e64;
  case AMDGPU::V_PK_FMA_F32:
    return AMDGPU::V_FMA_F32_e64;
  default:
    return std::numeric_limits<uint16_t>::max();
  }
  llvm_unreachable("Fully covered switch");
}

Review comment quoting llvm_unreachable("Fully covered switch");, followed by a "ping".

bool GCNPreRAOptimizationsImpl::hasReadWriteDependencies(
    const MachineInstr &PredMI, const MachineInstr &SuccMI) {
  for (const MachineOperand &Pred_Ops : PredMI.operands()) {
    if (!Pred_Ops.isReg() || !Pred_Ops.isDef())
      continue;
    Register Pred_Reg = Pred_Ops.getReg();
    if (!Pred_Reg.isValid())
      continue;
    for (const MachineOperand &Succ_Ops : SuccMI.operands()) {
      if (!Succ_Ops.isReg() || !Succ_Ops.isDef())
        continue;
      Register Succ_Reg = Succ_Ops.getReg();
      if (!Succ_Reg.isValid())
        continue;
      if ((Pred_Reg == Succ_Reg) || TRI->regsOverlap(Pred_Reg, Succ_Reg)) {
        return true;
      }
    }
  }
  return false;
}

bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
    MachineInstr &BeginMI, SetVector<MachineInstr *> &InstrsToUnpack,
    uint16_t NumMFMACycles) {
  auto *BB = BeginMI.getParent();
  auto E = BB->end();
  int TotalCyclesBetweenCandidates = 0;
  auto SchedModel = TII->getSchedModel();
  for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
    MachineInstr &Instr = *I;
    const MCSchedClassDesc *InstrSchedClassDesc =
        SchedModel.resolveSchedClass(&Instr);
    TotalCyclesBetweenCandidates +=
        SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;

    if (Instr.isMetaInstruction())
      continue;
    if (Instr.isTerminator())
      return false;
    if (TotalCyclesBetweenCandidates > NumMFMACycles)
      return false;
    if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
      if (hasReadWriteDependencies(BeginMI, Instr))
        return false;

      // If it is a packed instruction, subtract its latency from the overall
      // latency calculation here, because the packed instruction will be
      // removed and replaced by 2 unpacked instructions.
      TotalCyclesBetweenCandidates -=
          SchedModel.getWriteProcResBegin(InstrSchedClassDesc)->ReleaseAtCycle;
      // Add 2 to account for the extra latency added by unpacking into 2
      // instructions. At the time of writing, the considered unpacked
      // instructions have a latency of 1.
      // TODO: improve latency handling of possible inserted instructions.
      TotalCyclesBetweenCandidates += 2;
      if (!(TotalCyclesBetweenCandidates >= NumMFMACycles - 1))
        InstrsToUnpack.insert(&Instr);
    }
  }
  return true;
}

Review comment (on the cycle-count accumulation): This cycle count modelling is inaccurate and doesn't properly account for observed latency between instructions. If there are dependencies in the instructions between the MFMA and the unpack instruction, we must observe the full latency of the dependee. For example, if we have a load and a use of the load between the MFMA and the unpack instruction, we will incur the full latency of the load between the MFMA and the unpack candidate. Another interesting case is if the unpack candidate uses some value from the MFMA; in this case, the unpack candidate must wait for the MFMA regardless.

Review reply: Improving the cycle count modelling is not something that we should attempt to handle -- I view it more as a downside to the implementation choice. To do proper cycle count modelling we would basically need to do ad-hoc reconstructions of the schedule DAG for the dependency modelling. If we do believe that such modelling is needed, we should probably just move this unpacking into scheduling somewhere, and use the scheduler's mechanisms to do latency profitability checks.
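
The walk above is essentially a cycle-budget check: every instruction after the MFMA consumes part of the MFMA's ReleaseAtCycle budget, and a packed candidate is unpacked only if replacing it with two single-lane instructions still fits inside that budget. Below is a minimal standalone sketch of that bookkeeping, assuming made-up cycle counts and a stand-in PackedCandidate flag in place of the real isUnpackingSupportedInstr/isNeverCoissue/dependency checks:

```cpp
#include <cstdint>
#include <iostream>
#include <vector>

struct Instr {
  uint16_t Cycles;      // stand-in for this instruction's ReleaseAtCycle
  bool PackedCandidate; // stand-in for isUnpackingSupportedInstr(...)
};

// Returns the indices of packed candidates that still fall inside the MFMA's
// latency shadow, mirroring the budget arithmetic in createListOfPackedInstr.
std::vector<size_t> pickUnpackCandidates(const std::vector<Instr> &Follow,
                                         uint16_t NumMFMACycles) {
  std::vector<size_t> Picked;
  int Total = 0;
  for (size_t Idx = 0; Idx < Follow.size(); ++Idx) {
    Total += Follow[Idx].Cycles;
    if (Total > NumMFMACycles)
      break; // past the MFMA's shadow; stop scanning
    if (!Follow[Idx].PackedCandidate)
      continue;
    // The packed op disappears and is replaced by two unpacked ops, each
    // assumed to take 1 cycle (hence -Cycles + 2).
    Total += 2 - Follow[Idx].Cycles;
    if (Total < NumMFMACycles - 1)
      Picked.push_back(Idx);
  }
  return Picked;
}

int main() {
  // Hypothetical sequence after an MFMA with a 16-cycle budget: a 4-cycle
  // VALU op followed by two 1-cycle packed candidates.
  std::vector<Instr> Follow = {{4, false}, {1, true}, {1, true}};
  for (size_t Idx : pickUnpackCandidates(Follow, /*NumMFMACycles=*/16))
    std::cout << "unpack candidate at index " << Idx << "\n";
}
```

Unlike the pass, the sketch simply stops scanning once the budget is exhausted rather than returning a success/failure flag, and it omits the terminator, meta-instruction, and read/write dependency checks.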

void GCNPreRAOptimizationsImpl::insertUnpackedF32MI(MachineInstr &I,
                                                    bool IsVreg_64,
                                                    bool IsFMA) {
  MachineBasicBlock &MBB = *I.getParent();
  const DebugLoc &DL = I.getDebugLoc();
  Register DstReg = I.getOperand(0).getReg();

  uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
  if (UnpackedOpcode == std::numeric_limits<uint16_t>::max())
    return;

  MachineInstrBuilder Op0L_Op1L = createUnpackedMI(
      MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/false, /*IsFMA=*/IsFMA);
  if (IsVreg_64 || I.getOperand(0).isUndef())
    Op0L_Op1L->getOperand(0).setIsUndef();
  LIS->InsertMachineInstrInMaps(*Op0L_Op1L);

  MachineInstrBuilder Op0H_Op1H = createUnpackedMI(
      MBB, I, DL, UnpackedOpcode, /*IsHiBits=*/true, /*IsFMA=*/IsFMA);
  LIS->InsertMachineInstrInMaps(*Op0H_Op1H);

  if (I.getFlag(MachineInstr::MIFlag::NoFPExcept)) {
    Op0L_Op1L->setFlag(MachineInstr::MIFlag::NoFPExcept);
    Op0H_Op1H->setFlag(MachineInstr::MIFlag::NoFPExcept);
  }
  if (I.getFlag(MachineInstr::MIFlag::FmContract)) {
    Op0L_Op1L->setFlag(MachineInstr::MIFlag::FmContract);
    Op0H_Op1H->setFlag(MachineInstr::MIFlag::FmContract);
  }

  LIS->RemoveMachineInstrFromMaps(I);
  I.eraseFromParent();
  LIS->removeInterval(DstReg);
  LIS->createAndComputeVirtRegInterval(DstReg);
}

void GCNPreRAOptimizationsImpl::addOperandandMods(MachineInstrBuilder NewMI,
                                                  unsigned Src_Mods,
                                                  unsigned NegModifier,
                                                  unsigned OpSelModifier,
                                                  MachineOperand &SrcMO) {
  unsigned New_Src_Mods = 0;
  // If NEG or NEG_HI is true, we need to negate the corresponding 32-bit lane.
  // NEG_HI shares the same bit position with ABS, but packed instructions do
  // not support ABS. Therefore, NEG_HI must be translated to a NEG source
  // modifier for the higher 32 bits. Unpacked VOP3 instructions do support
  // ABS, so we need to explicitly add the NEG modifier if it is present in
  // the packed instruction.
  if (Src_Mods & NegModifier) {
    New_Src_Mods |= SISrcMods::NEG;
  }
  // Src modifiers: only negation modifiers are added if needed. Unpacked
  // operations do not have op_sel, so it must be handled explicitly below by
  // picking the corresponding sub-register. Packed instructions do not
  // support abs, so abs is not handled here.
  NewMI.addImm(New_Src_Mods);
  if (SrcMO.isImm()) {
    NewMI.addImm(SrcMO.getImm());
  } else {
    if (Src_Mods & OpSelModifier) {
      unsigned Src0SubIdx =
          TRI->composeSubRegIndices(SrcMO.getSubReg(), AMDGPU::sub1);
      // If op_sel == 1, select the high half (reg:sub1) of the 64-bit source.
      NewMI.addReg(SrcMO.getReg(), 0, Src0SubIdx);
    } else {
      unsigned Src0SubIdx =
          TRI->composeSubRegIndices(SrcMO.getSubReg(), AMDGPU::sub0);
      // If op_sel == 0, select the low half (reg:sub0) of the 64-bit source.
      NewMI.addReg(SrcMO.getReg(), 0, Src0SubIdx);
    }
  }
}
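
A self-contained sketch of the modifier translation performed by addOperandandMods: given the packed (VOP3P) source-modifier bits and the lane being emitted, it produces the VOP3 neg bit and the 32-bit half (sub0/sub1) of the 64-bit source to read. The bit positions below are illustrative placeholders; the real values come from the SISrcMods enum in SIDefines.h.

```cpp
#include <cstdint>
#include <iostream>

// Illustrative stand-ins for the SISrcMods bits used by the pass (the actual
// values are defined in llvm/lib/Target/AMDGPU/SIDefines.h).
enum SrcModBits : unsigned {
  MOD_NEG = 1u << 0,      // negate the low lane
  MOD_NEG_HI = 1u << 1,   // negate the high lane (shares its bit with ABS)
  MOD_OP_SEL_0 = 1u << 2, // low-lane op reads the high half when set
  MOD_OP_SEL_1 = 1u << 3, // high-lane op reads the high half when set
};

struct UnpackedSrc {
  unsigned VOP3Mods; // only MOD_NEG is ever produced for the unpacked op
  unsigned SubHalf;  // 0 -> sub0 (low 32 bits), 1 -> sub1 (high 32 bits)
};

UnpackedSrc translateSrc(unsigned PackedMods, bool IsHiBits) {
  // Pick which packed-modifier bits apply to the lane we are emitting.
  unsigned NegBit = IsHiBits ? MOD_NEG_HI : MOD_NEG;
  unsigned OpSelBit = IsHiBits ? MOD_OP_SEL_1 : MOD_OP_SEL_0;
  UnpackedSrc Out{0, 0};
  if (PackedMods & NegBit)
    Out.VOP3Mods |= MOD_NEG; // translate NEG/NEG_HI to a plain VOP3 NEG
  Out.SubHalf = (PackedMods & OpSelBit) ? 1u : 0u; // op_sel picks sub1
  return Out;
}

int main() {
  // Example: the packed source negates only the high lane and sets op_sel_hi.
  // The high-lane unpacked instruction then reads sub1 negated, while the
  // low-lane one reads sub0 without negation.
  unsigned PackedMods = MOD_NEG_HI | MOD_OP_SEL_1;
  UnpackedSrc Hi = translateSrc(PackedMods, /*IsHiBits=*/true);
  UnpackedSrc Lo = translateSrc(PackedMods, /*IsHiBits=*/false);
  std::cout << "hi: neg=" << (Hi.VOP3Mods & MOD_NEG ? 1 : 0) << " sub"
            << Hi.SubHalf << "\n";
  std::cout << "lo: neg=" << (Lo.VOP3Mods & MOD_NEG ? 1 : 0) << " sub"
            << Lo.SubHalf << "\n";
}
```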

MachineInstrBuilder GCNPreRAOptimizationsImpl::createUnpackedMI(
    MachineBasicBlock &MBB, MachineInstr &I, const DebugLoc &DL,
    uint16_t UnpackedOpcode, bool IsHiBits, bool IsFMA) {
  MachineOperand &DstMO = I.getOperand(0);
  MachineOperand &SrcMO1 = I.getOperand(2);
  MachineOperand &SrcMO2 = I.getOperand(4);
  Register DstReg = DstMO.getReg();
  unsigned DestSubIdx =
      IsHiBits ? TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub1)
               : TRI->composeSubRegIndices(DstMO.getSubReg(), AMDGPU::sub0);
  int ClampIdx =
      AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::clamp);
  int64_t ClampVal = I.getOperand(ClampIdx).getImm();
  int Src0_modifiers_Idx =
      AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src0_modifiers);
  int Src1_modifiers_Idx =
      AMDGPU::getNamedOperandIdx(I.getOpcode(), AMDGPU::OpName::src1_modifiers);

  unsigned Src0_Mods = I.getOperand(Src0_modifiers_Idx).getImm();
  unsigned Src1_Mods = I.getOperand(Src1_modifiers_Idx).getImm();
  // Packed instructions (VOP3P) do not support abs, so it is safe to ignore it.

  unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
  unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;

  MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
  NewMI.addDef(DstReg, 0, DestSubIdx); // vdst
  addOperandandMods(NewMI, Src0_Mods, NegModifier, OpSelModifier, SrcMO1);
  addOperandandMods(NewMI, Src1_Mods, NegModifier, OpSelModifier, SrcMO2);

  if (IsFMA) {
    MachineOperand &SrcMO3 = I.getOperand(6);
    int Src2_modifiers_Idx = AMDGPU::getNamedOperandIdx(
        I.getOpcode(), AMDGPU::OpName::src2_modifiers);
    unsigned Src2_Mods = I.getOperand(Src2_modifiers_Idx).getImm();
    addOperandandMods(NewMI, Src2_Mods, NegModifier, OpSelModifier, SrcMO3);
  }
  NewMI.addImm(ClampVal); // clamp
  // Packed instructions do not support output modifiers, so it is safe to
  // pass 0 for this use case.
  NewMI.addImm(0); // omod
  return NewMI;
}

void GCNPreRAOptimizationsImpl::processF32Unpacking(MachineInstr &I) {
  bool IsFMA = I.getOpcode() == AMDGPU::V_PK_FMA_F32;
  const TargetRegisterClass *DstRC = MRI->getRegClass(I.getOperand(0).getReg());
  bool IsVReg64 = (DstRC->getID() == AMDGPU::VReg_64_Align2RegClassID);
  insertUnpackedF32MI(I, IsVReg64, IsFMA);
}

bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
  if (skipFunction(MF.getFunction()))
    return false;
@@ -260,38 +523,57 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
    Changed |= processReg(Reg);
  }

  // Unpack packed instructions to overlap MFMAs. This allows the compiler to
  // co-issue unpacked instructions with MFMAs.
  for (MachineBasicBlock &MBB : MF) {
    SetVector<MachineInstr *> InstrsToUnpack;
    SetVector<MachineOperand *> WriteOperands;
    SetVector<MachineOperand *> ReadOperands;
    uint16_t NumMFMACycles = 0;
    auto SchedModel = TII->getSchedModel();
    for (MachineInstr &MI : MBB) {
      if (SIInstrInfo::isMFMA(MI)) {
        const MCSchedClassDesc *SchedClassDesc =
            SchedModel.resolveSchedClass(&MI);
        NumMFMACycles =
            SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
        createListOfPackedInstr(MI, InstrsToUnpack, NumMFMACycles);
      }
      // Add RA hints to improve True16 COPY elimination.
      if (ST.useRealTrue16Insts()) {
        if (MI.getOpcode() != AMDGPU::COPY)
          continue;
        Register Dst = MI.getOperand(0).getReg();
        Register Src = MI.getOperand(1).getReg();
        if (Dst.isVirtual() &&
            MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
            Src.isPhysical() &&
            TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
          MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
        if (Src.isVirtual() &&
            MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
            Dst.isPhysical() &&
            TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
          MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
        if (!Dst.isVirtual() || !Src.isVirtual())
          continue;
        if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
            MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
          MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
          MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
        }
        if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
            MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
          MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
      }
    }

    if (!InstrsToUnpack.empty()) {
      for (MachineInstr *MI : InstrsToUnpack) {
        processF32Unpacking(*MI);
      }
    }
  }
  LIS->reanalyze(MF);
  return Changed;
}

Review comment (on the NumMFMACycles computation): Should we reduce for the issue latency of the MFMA? Followed by a "ping".