Skip to content

Commit c581612

Browse files
committed
miscellaneous code optimizations and cleanup
1 parent d3b19c6 commit c581612

File tree

3 files changed

+127
-134
lines changed

3 files changed

+127
-134
lines changed

llvm/lib/Target/AMDGPU/GCNPreRAOptimizations.cpp

Lines changed: 68 additions & 133 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@
2828
/// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
2929
/// the VGPR_32, the COPY can be completely eliminated.
3030
///
31+
/// Additionally, this pass unpacks packed instructions (V_PK_MUL_F32 and V_PK_ADD_F32)
32+
/// adjacent to MFMAs such that they can be co-issued.
33+
/// This helps with overlapping MFMA and certain vector instructions in machine schedules
34+
/// and is expected to improve performance.
35+
/// Only packed instructions that are overlapped by the MFMA latency are unpacked.
36+
/// All other packed instructions remain untouched.
3137
//===----------------------------------------------------------------------===//
3238

3339
#include "GCNPreRAOptimizations.h"
@@ -38,12 +44,10 @@
3844
#include "llvm/CodeGen/LiveIntervals.h"
3945
#include "llvm/CodeGen/MachineFunctionPass.h"
4046
#include "llvm/InitializePasses.h"
41-
47+
#include "llvm/ADT/DenseSet.h"
4248
#include "SIInstrInfo.h"
4349
#include "llvm/CodeGen/RegisterScavenging.h"
4450
#include "llvm/InitializePasses.h"
45-
#include <unordered_set>
46-
4751
#include "GCNSchedStrategy.h"
4852
#include "llvm/CodeGen/MachineInstr.h"
4953
#include "llvm/CodeGen/MachineScheduler.h"
@@ -61,11 +65,10 @@ class GCNPreRAOptimizationsImpl {
6165
LiveIntervals *LIS;
6266

6367
bool processReg(Register Reg);
64-
bool unpackInsts(MachineFunction &MF);
65-
bool createListOfPackedInstr(MachineInstr &BeginMI, std::unordered_set<MachineInstr *> &seen);
66-
bool isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const;
68+
bool createListOfPackedInstr(MachineInstr &BeginMI, DenseSet<MachineInstr *> &instrsToUnpack);
6769
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
6870
void insertMI(MachineInstr &I);
71+
uint16_t mapToUnpackedOpcode(MachineInstr &I);
6972
SmallVector<MachineInstr *, 2> copyToVregAndInsertMI(MachineInstr &I,
7073
unsigned SGPRSrcPos);
7174
SmallVector<MachineInstr *, 2>
@@ -244,80 +247,28 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
244247
return true;
245248
}
246249

247-
bool GCNPreRAOptimizationsImpl::isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const {
248-
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
249-
// bool IsGFX942Only = ST.hasGFX940Insts() && !ST.hasGFX950Insts();
250-
// if (!IsGFX942Only)
251-
// return false;
252-
253-
if (!SIInstrInfo::isVALU(MI)){
254-
return false;
255-
}
256-
257-
258-
// V_COS, V_EXP, V_RCP, etc.
259-
if (SIInstrInfo::isTRANS(MI))
260-
return true;
261-
262-
// DOT2, DOT2C, DOT4, etc.
263-
if (SIInstrInfo::isDOT(MI))
264-
return true;
265-
266-
// MFMA, SMFMA
267-
if (SIInstrInfo::isMFMA(MI))
268-
return true;
269-
250+
bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) const {
270251
unsigned Opcode = MI.getOpcode();
271252
switch (Opcode) {
272-
case AMDGPU::V_CVT_PK_BF8_F32_e64:
273-
case AMDGPU::V_CVT_PK_FP8_F32_e64:
274-
case AMDGPU::V_MQSAD_PK_U16_U8_e64:
275-
case AMDGPU::V_MQSAD_U32_U8_e64:
276-
case AMDGPU::V_PK_ADD_F16:
277-
case AMDGPU::V_PK_ADD_F32:
278-
case AMDGPU::V_PK_ADD_I16:
279-
case AMDGPU::V_PK_ADD_U16:
280-
case AMDGPU::V_PK_ASHRREV_I16:
281-
case AMDGPU::V_PK_FMA_F16:
282-
case AMDGPU::V_PK_FMA_F32:
283-
case AMDGPU::V_PK_FMAC_F16_e32:
284-
case AMDGPU::V_PK_FMAC_F16_e64:
285-
case AMDGPU::V_PK_LSHLREV_B16:
286-
case AMDGPU::V_PK_LSHRREV_B16:
287-
case AMDGPU::V_PK_MAD_I16:
288-
case AMDGPU::V_PK_MAD_U16:
289-
case AMDGPU::V_PK_MAX_F16:
290-
case AMDGPU::V_PK_MAX_I16:
291-
case AMDGPU::V_PK_MAX_U16:
292-
case AMDGPU::V_PK_MIN_F16:
293-
case AMDGPU::V_PK_MIN_I16:
294-
case AMDGPU::V_PK_MIN_U16:
295-
case AMDGPU::V_PK_MOV_B32:
296-
case AMDGPU::V_PK_MUL_F16:
297-
case AMDGPU::V_PK_MUL_F32:
298-
case AMDGPU::V_PK_MUL_LO_U16:
299-
case AMDGPU::V_PK_SUB_I16:
300-
case AMDGPU::V_PK_SUB_U16:
301-
case AMDGPU::V_QSAD_PK_U16_U8_e64:
302-
return true;
303-
304-
default:
305-
return false;
253+
case AMDGPU::V_PK_ADD_F32:
254+
case AMDGPU::V_PK_MUL_F32:
255+
return true;
256+
257+
default:
258+
return false;
306259

307260
}
308261
}
309262

310-
bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) const {
311-
unsigned Opcode = MI.getOpcode();
263+
uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
264+
unsigned Opcode = I.getOpcode();
312265
switch (Opcode) {
313-
case AMDGPU::V_PK_ADD_F16:
314-
case AMDGPU::V_PK_ADD_F32:
315-
case AMDGPU::V_PK_MUL_F16:
316-
case AMDGPU::V_PK_MUL_F32:
317-
return true;
318-
319-
default:
320-
return false;
266+
case AMDGPU::V_PK_ADD_F32:
267+
return AMDGPU::V_ADD_F32_e64;
268+
case AMDGPU::V_PK_MUL_F32:
269+
return AMDGPU::V_MUL_F32_e64;
270+
default:
271+
return std::numeric_limits<uint16_t>::max();
321272

322273
}
323274
}
@@ -358,7 +309,7 @@ GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I,
358309
}
359310

360311
bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
361-
MachineInstr &BeginMI, std::unordered_set<MachineInstr *> &seen) {
312+
MachineInstr &BeginMI, DenseSet<MachineInstr *> &instrsToUnpack) {
362313
auto *BB = BeginMI.getParent();
363314
auto *MF = BB->getParent();
364315
int NumInst = 0;
@@ -377,13 +328,13 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
377328

378329
if (Instr.isTerminator())
379330
return false;
380-
331+
381332
if (totalCyclesBetweenCandidates > NumMFMACycles)
382333
return false;
383334

384-
if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F32) && isNeverCoissue(Instr, Instr.getParent()->getParent())) {
335+
if ((isUnpackingSupportedInstr(Instr)) && TII->isNeverCoissue(Instr)) {
385336
totalCyclesBetweenCandidates += 1;
386-
seen.insert(&Instr);
337+
instrsToUnpack.insert(&Instr);
387338
}
388339
}
389340
return true;
@@ -420,8 +371,8 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
420371
// Don't worry about abs values; packed instructions (VOP3P) do not support them.
421372
unsigned Lo_src0_mods = 0;
422373
unsigned Lo_src1_mods = 0;
423-
424-
MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64));
374+
uint16_t unpackedOpcode = mapToUnpackedOpcode(I);
375+
MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(unpackedOpcode));
425376
Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); //vdst
426377
if (src0_Mods & SISrcMods::OP_SEL_0) {
427378
if (src0_Mods & SISrcMods::NEG) {
@@ -476,7 +427,7 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
476427
unsigned Hi_src0_mods = 0;
477428
unsigned Hi_src1_mods = 0;
478429

479-
MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64));
430+
MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(unpackedOpcode));
480431
Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); //vdst
481432
if (src0_Mods & SISrcMods::OP_SEL_1) {
482433
if (src0_Mods & SISrcMods::NEG_HI) {
@@ -600,29 +551,6 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
600551
return;
601552
}
602553

603-
bool GCNPreRAOptimizationsImpl::unpackInsts(MachineFunction &MF) {
604-
605-
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
606-
TII = ST.getInstrInfo();
607-
TRI = &TII->getRegisterInfo();
608-
609-
auto schedModel = TII->getSchedModel();
610-
for (MachineBasicBlock &MBB : MF) {
611-
std::unordered_set<MachineInstr *> seen;
612-
for (MachineInstr &MI : MBB) {
613-
if (SIInstrInfo::isMFMA(MI)){
614-
createListOfPackedInstr(MI, seen);
615-
}
616-
617-
}
618-
if (!seen.empty()) {
619-
for (MachineInstr *MI : seen)
620-
insertMI(*MI);
621-
}
622-
}
623-
return true;
624-
}
625-
626554
bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
627555
if (skipFunction(MF.getFunction()))
628556
return false;
@@ -646,7 +574,6 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
646574

647575
bool Changed = false;
648576

649-
Changed = unpackInsts(MF);
650577
for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
651578
Register Reg = Register::index2VirtReg(I);
652579
if (!LIS->hasInterval(Reg))
@@ -659,38 +586,46 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
659586
Changed |= processReg(Reg);
660587
}
661588

662-
if (!ST.useRealTrue16Insts())
663-
return Changed;
664-
665589
// Add RA hints to improve True16 COPY elimination.
666-
for (const MachineBasicBlock &MBB : MF) {
667-
for (const MachineInstr &MI : MBB) {
668-
if (MI.getOpcode() != AMDGPU::COPY)
669-
continue;
670-
Register Dst = MI.getOperand(0).getReg();
671-
Register Src = MI.getOperand(1).getReg();
672-
if (Dst.isVirtual() &&
673-
MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
674-
Src.isPhysical() &&
675-
TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
676-
MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
677-
if (Src.isVirtual() &&
678-
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
679-
Dst.isPhysical() &&
680-
TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
681-
MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
682-
if (!Dst.isVirtual() || !Src.isVirtual())
683-
continue;
684-
if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
685-
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
686-
MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
687-
MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
590+
// Unpack packed instructions that are overlapped by MFMAs. This allows the compiler to co-issue the unpacked instructions with the MFMA.
591+
for (MachineBasicBlock &MBB : MF) {
592+
DenseSet<MachineInstr *> instrsToUnpack;
593+
for (MachineInstr &MI : MBB) {
594+
if (SIInstrInfo::isMFMA(MI)){
595+
createListOfPackedInstr(MI, instrsToUnpack);
596+
}
597+
if (ST.useRealTrue16Insts()){
598+
if (MI.getOpcode() != AMDGPU::COPY)
599+
continue;
600+
Register Dst = MI.getOperand(0).getReg();
601+
Register Src = MI.getOperand(1).getReg();
602+
if (Dst.isVirtual() &&
603+
MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
604+
Src.isPhysical() &&
605+
TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
606+
MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
607+
if (Src.isVirtual() &&
608+
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
609+
Dst.isPhysical() &&
610+
TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
611+
MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
612+
if (!Dst.isVirtual() || !Src.isVirtual())
613+
continue;
614+
if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
615+
MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
616+
MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
617+
MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
618+
}
619+
if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
620+
MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
621+
MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
688622
}
689-
if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
690-
MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
691-
MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
623+
}
624+
625+
if (!instrsToUnpack.empty()) {
626+
for (MachineInstr *MI : instrsToUnpack)
627+
insertMI(*MI);
692628
}
693629
}
694-
695630
return Changed;
696631
}

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 58 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,6 @@
1515
#include "AMDGPU.h"
1616
#include "AMDGPUInstrInfo.h"
1717
#include "GCNHazardRecognizer.h"
18-
#include "GCNSubtarget.h"
1918
#include "SIMachineFunctionInfo.h"
2019
#include "Utils/AMDGPUBaseInfo.h"
2120
#include "llvm/Analysis/ValueTracking.h"
@@ -6173,6 +6172,64 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx,
61736172
return isImmOperandLegal(MI, OpIdx, *MO);
61746173
}
61756174

6175+
bool SIInstrInfo::isNeverCoissue(MachineInstr &MI) const {
6176+
bool IsGFX950Only = ST.hasGFX950Insts();
6177+
if (!IsGFX950Only)
6178+
return false;
6179+
6180+
if (!isVALU(MI))
6181+
return false;
6182+
6183+
// V_COS, V_EXP, V_RCP, etc.
6184+
if (isTRANS(MI))
6185+
return true;
6186+
6187+
// DOT2, DOT2C, DOT4, etc.
6188+
if (isDOT(MI))
6189+
return true;
6190+
6191+
// MFMA, SMFMA
6192+
if (isMFMA(MI))
6193+
return true;
6194+
6195+
unsigned Opcode = MI.getOpcode();
6196+
switch (Opcode) {
6197+
case AMDGPU::V_CVT_PK_BF8_F32_e64:
6198+
case AMDGPU::V_CVT_PK_FP8_F32_e64:
6199+
case AMDGPU::V_MQSAD_PK_U16_U8_e64:
6200+
case AMDGPU::V_MQSAD_U32_U8_e64:
6201+
case AMDGPU::V_PK_ADD_F16:
6202+
case AMDGPU::V_PK_ADD_F32:
6203+
case AMDGPU::V_PK_ADD_I16:
6204+
case AMDGPU::V_PK_ADD_U16:
6205+
case AMDGPU::V_PK_ASHRREV_I16:
6206+
case AMDGPU::V_PK_FMA_F16:
6207+
case AMDGPU::V_PK_FMA_F32:
6208+
case AMDGPU::V_PK_FMAC_F16_e32:
6209+
case AMDGPU::V_PK_FMAC_F16_e64:
6210+
case AMDGPU::V_PK_LSHLREV_B16:
6211+
case AMDGPU::V_PK_LSHRREV_B16:
6212+
case AMDGPU::V_PK_MAD_I16:
6213+
case AMDGPU::V_PK_MAD_U16:
6214+
case AMDGPU::V_PK_MAX_F16:
6215+
case AMDGPU::V_PK_MAX_I16:
6216+
case AMDGPU::V_PK_MAX_U16:
6217+
case AMDGPU::V_PK_MIN_F16:
6218+
case AMDGPU::V_PK_MIN_I16:
6219+
case AMDGPU::V_PK_MIN_U16:
6220+
case AMDGPU::V_PK_MOV_B32:
6221+
case AMDGPU::V_PK_MUL_F16:
6222+
case AMDGPU::V_PK_MUL_F32:
6223+
case AMDGPU::V_PK_MUL_LO_U16:
6224+
case AMDGPU::V_PK_SUB_I16:
6225+
case AMDGPU::V_PK_SUB_U16:
6226+
case AMDGPU::V_QSAD_PK_U16_U8_e64:
6227+
return true;
6228+
default:
6229+
return false;
6230+
}
6231+
}
6232+
61766233
void SIInstrInfo::legalizeOperandsVOP2(MachineRegisterInfo &MRI,
61776234
MachineInstr &MI) const {
61786235
unsigned Opc = MI.getOpcode();

llvm/lib/Target/AMDGPU/SIInstrInfo.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1178,6 +1178,7 @@ class SIInstrInfo final : public AMDGPUGenInstrInfo {
11781178
bool isImmOperandLegal(const MachineInstr &MI, unsigned OpNo,
11791179
const MachineOperand &MO) const;
11801180

1181+
bool isNeverCoissue(MachineInstr &MI) const;
11811182
/// Return true if this 64-bit VALU instruction has a 32-bit encoding.
11821183
/// This function will return false if you pass it a 32-bit instruction.
11831184
bool hasVALU32BitEncoding(unsigned Opcode) const;

0 commit comments

Comments
 (0)