 /// and a VGPR_16. If we use the VGPR_16 that corresponds to the lo16 bits of
 /// the VGPR_32, the COPY can be completely eliminated.
 ///
+/// Additionally, this pass unpacks packed instructions (V_PK_MUL_F32 and
+/// V_PK_ADD_F32) adjacent to MFMAs so that they can be co-issued. This helps
+/// the machine scheduler overlap MFMAs with certain vector instructions and
+/// is expected to improve performance. Only packed instructions that can be
+/// hidden by the MFMA latency are unpacked; the rest are left untouched.
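+///
+/// For example (register assignments are illustrative only):
+///   v_pk_mul_f32 v[0:1], v[2:3], v[4:5]
+/// is unpacked into
+///   v_mul_f32_e64 v0, v2, v4
+///   v_mul_f32_e64 v1, v3, v5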
 //===----------------------------------------------------------------------===//

 #include "GCNPreRAOptimizations.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/InitializePasses.h"
-
+#include "llvm/ADT/DenseSet.h"
 #include "SIInstrInfo.h"
 #include "llvm/CodeGen/RegisterScavenging.h"
 #include "llvm/InitializePasses.h"
-#include <unordered_set>
-
+#include <limits>
 #include "GCNSchedStrategy.h"
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineScheduler.h"
@@ -61,11 +65,10 @@ class GCNPreRAOptimizationsImpl {
   LiveIntervals *LIS;

   bool processReg(Register Reg);
-  bool unpackInsts(MachineFunction &MF);
-  bool createListOfPackedInstr(MachineInstr &BeginMI, std::unordered_set<MachineInstr *> &seen);
-  bool isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const;
+  bool createListOfPackedInstr(MachineInstr &BeginMI,
+                               DenseSet<MachineInstr *> &instrsToUnpack);
   bool isUnpackingSupportedInstr(MachineInstr &MI) const;
   void insertMI(MachineInstr &I);
+  uint16_t mapToUnpackedOpcode(MachineInstr &I);
   SmallVector<MachineInstr *, 2> copyToVregAndInsertMI(MachineInstr &I,
                                                        unsigned SGPRSrcPos);
   SmallVector<MachineInstr *, 2>
@@ -244,80 +247,28 @@ bool GCNPreRAOptimizationsImpl::processReg(Register Reg) {
   return true;
 }

-bool GCNPreRAOptimizationsImpl::isNeverCoissue(MachineInstr &MI, MachineFunction *MF) const {
-  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
-  // bool IsGFX942Only = ST.hasGFX940Insts() && !ST.hasGFX950Insts();
-  // if (!IsGFX942Only)
-  //   return false;
-
-  if (!SIInstrInfo::isVALU(MI)){
-    return false;
-  }
-
-
-  // V_COS, V_EXP, V_RCP, etc.
-  if (SIInstrInfo::isTRANS(MI))
-    return true;
-
-  // DOT2, DOT2C, DOT4, etc.
-  if (SIInstrInfo::isDOT(MI))
-    return true;
-
-  // MFMA, SMFMA
-  if (SIInstrInfo::isMFMA(MI))
-    return true;
-
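+// Identify packed instructions that this pass currently knows how to unpack:
+// only packed FP32 add and mul are handled for now.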
+bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(
+    MachineInstr &MI) const {
   unsigned Opcode = MI.getOpcode();
   switch (Opcode) {
-  case AMDGPU::V_CVT_PK_BF8_F32_e64:
-  case AMDGPU::V_CVT_PK_FP8_F32_e64:
-  case AMDGPU::V_MQSAD_PK_U16_U8_e64:
-  case AMDGPU::V_MQSAD_U32_U8_e64:
-  case AMDGPU::V_PK_ADD_F16:
-  case AMDGPU::V_PK_ADD_F32:
-  case AMDGPU::V_PK_ADD_I16:
-  case AMDGPU::V_PK_ADD_U16:
-  case AMDGPU::V_PK_ASHRREV_I16:
-  case AMDGPU::V_PK_FMA_F16:
-  case AMDGPU::V_PK_FMA_F32:
-  case AMDGPU::V_PK_FMAC_F16_e32:
-  case AMDGPU::V_PK_FMAC_F16_e64:
-  case AMDGPU::V_PK_LSHLREV_B16:
-  case AMDGPU::V_PK_LSHRREV_B16:
-  case AMDGPU::V_PK_MAD_I16:
-  case AMDGPU::V_PK_MAD_U16:
-  case AMDGPU::V_PK_MAX_F16:
-  case AMDGPU::V_PK_MAX_I16:
-  case AMDGPU::V_PK_MAX_U16:
-  case AMDGPU::V_PK_MIN_F16:
-  case AMDGPU::V_PK_MIN_I16:
-  case AMDGPU::V_PK_MIN_U16:
-  case AMDGPU::V_PK_MOV_B32:
-  case AMDGPU::V_PK_MUL_F16:
-  case AMDGPU::V_PK_MUL_F32:
-  case AMDGPU::V_PK_MUL_LO_U16:
-  case AMDGPU::V_PK_SUB_I16:
-  case AMDGPU::V_PK_SUB_U16:
-  case AMDGPU::V_QSAD_PK_U16_U8_e64:
-    return true;
-
-  default:
-    return false;
+  case AMDGPU::V_PK_ADD_F32:
+  case AMDGPU::V_PK_MUL_F32:
+    return true;
+
+  default:
+    return false;

   }
 }

-bool GCNPreRAOptimizationsImpl::isUnpackingSupportedInstr(MachineInstr &MI) const {
-  unsigned Opcode = MI.getOpcode();
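+// Map a packed opcode to its unpacked VOP3 counterpart. Returns
+// std::numeric_limits<uint16_t>::max() when there is no mapping.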
+uint16_t GCNPreRAOptimizationsImpl::mapToUnpackedOpcode(MachineInstr &I) {
+  unsigned Opcode = I.getOpcode();
   switch (Opcode) {
-  case AMDGPU::V_PK_ADD_F16:
-  case AMDGPU::V_PK_ADD_F32:
-  case AMDGPU::V_PK_MUL_F16:
-  case AMDGPU::V_PK_MUL_F32:
-    return true;
-
-  default:
-    return false;
+  case AMDGPU::V_PK_ADD_F32:
+    return AMDGPU::V_ADD_F32_e64;
+  case AMDGPU::V_PK_MUL_F32:
+    return AMDGPU::V_MUL_F32_e64;
+  default:
+    return std::numeric_limits<uint16_t>::max();

   }
 }
@@ -358,7 +309,7 @@ GCNPreRAOptimizationsImpl::copyToVregAndInsertMI(MachineInstr &I,
358309}
359310
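+// Scan the instructions that issue in the shadow of BeginMI (an MFMA) and
+// record the packed instructions that qualify for unpacking in
+// instrsToUnpack.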
 bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(
-    MachineInstr &BeginMI, std::unordered_set<MachineInstr *> &seen) {
+    MachineInstr &BeginMI, DenseSet<MachineInstr *> &instrsToUnpack) {
   auto *BB = BeginMI.getParent();
   auto *MF = BB->getParent();
   int NumInst = 0;
@@ -377,13 +328,13 @@ bool GCNPreRAOptimizationsImpl::createListOfPackedInstr(

     if (Instr.isTerminator())
       return false;
-
+
     if (totalCyclesBetweenCandidates > NumMFMACycles)
       return false;

-    if ((Instr.getOpcode() == AMDGPU::V_PK_MUL_F32) && isNeverCoissue(Instr, Instr.getParent()->getParent())) {
+    if (isUnpackingSupportedInstr(Instr) && TII->isNeverCoissue(Instr)) {
       totalCyclesBetweenCandidates += 1;
-      seen.insert(&Instr);
+      instrsToUnpack.insert(&Instr);
     }
   }
   return true;
@@ -420,8 +371,8 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
   // don't worry about abs values. Packed instructions (VOP3P) do not support them
   unsigned Lo_src0_mods = 0;
   unsigned Lo_src1_mods = 0;
-
-  MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64));
+  uint16_t unpackedOpcode = mapToUnpackedOpcode(I);
+  MachineInstrBuilder Op0L_Op1L = BuildMI(MBB, I, DL, TII->get(unpackedOpcode));
   Op0L_Op1L.addDef(DstReg, 0, DestSubIdx); // vdst
   if (src0_Mods & SISrcMods::OP_SEL_0) {
     if (src0_Mods & SISrcMods::NEG) {
@@ -476,7 +427,7 @@ SmallVector<MachineInstr *, 2> GCNPreRAOptimizationsImpl::insertUnpackedMI(
   unsigned Hi_src0_mods = 0;
   unsigned Hi_src1_mods = 0;

-  MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MUL_F32_e64));
+  MachineInstrBuilder Op0H_Op1H = BuildMI(MBB, I, DL, TII->get(unpackedOpcode));
   Op0H_Op1H.addDef(DstReg, 0, DestSubIdx); // vdst
   if (src0_Mods & SISrcMods::OP_SEL_1) {
     if (src0_Mods & SISrcMods::NEG_HI) {
@@ -600,29 +551,6 @@ void GCNPreRAOptimizationsImpl::insertMI(MachineInstr &I) {
   return;
 }

-bool GCNPreRAOptimizationsImpl::unpackInsts(MachineFunction &MF) {
-
-  const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
-  TII = ST.getInstrInfo();
-  TRI = &TII->getRegisterInfo();
-
-  auto schedModel = TII->getSchedModel();
-  for (MachineBasicBlock &MBB : MF) {
-    std::unordered_set<MachineInstr *> seen;
-    for (MachineInstr &MI : MBB) {
-      if (SIInstrInfo::isMFMA(MI)){
-        createListOfPackedInstr(MI, seen);
-      }
-
-    }
-    if (!seen.empty()) {
-      for (MachineInstr *MI : seen)
-        insertMI(*MI);
-    }
-  }
-  return true;
-}
-
 bool GCNPreRAOptimizationsLegacy::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(MF.getFunction()))
     return false;
@@ -646,7 +574,6 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {

   bool Changed = false;

-  Changed = unpackInsts(MF);
   for (unsigned I = 0, E = MRI->getNumVirtRegs(); I != E; ++I) {
     Register Reg = Register::index2VirtReg(I);
     if (!LIS->hasInterval(Reg))
@@ -659,38 +586,46 @@ bool GCNPreRAOptimizationsImpl::run(MachineFunction &MF) {
     Changed |= processReg(Reg);
   }

-  if (!ST.useRealTrue16Insts())
-    return Changed;
-
   // Add RA hints to improve True16 COPY elimination.
-  for (const MachineBasicBlock &MBB : MF) {
-    for (const MachineInstr &MI : MBB) {
-      if (MI.getOpcode() != AMDGPU::COPY)
-        continue;
-      Register Dst = MI.getOperand(0).getReg();
-      Register Src = MI.getOperand(1).getReg();
-      if (Dst.isVirtual() &&
-          MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
-          Src.isPhysical() &&
-          TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
-        MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
-      if (Src.isVirtual() &&
-          MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
-          Dst.isPhysical() &&
-          TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
-        MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
-      if (!Dst.isVirtual() || !Src.isVirtual())
-        continue;
-      if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
-          MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
-        MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
-        MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+  // Unpack packed instructions to overlap them with MFMAs. This allows the
+  // compiler to co-issue the unpacked instructions with the MFMA.
+  for (MachineBasicBlock &MBB : MF) {
+    DenseSet<MachineInstr *> instrsToUnpack;
+    for (MachineInstr &MI : MBB) {
+      if (SIInstrInfo::isMFMA(MI)) {
+        createListOfPackedInstr(MI, instrsToUnpack);
+      }
+      if (ST.useRealTrue16Insts()) {
+        if (MI.getOpcode() != AMDGPU::COPY)
+          continue;
+        Register Dst = MI.getOperand(0).getReg();
+        Register Src = MI.getOperand(1).getReg();
+        if (Dst.isVirtual() &&
+            MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+            Src.isPhysical() &&
+            TRI->getRegClassForReg(*MRI, Src) == &AMDGPU::VGPR_32RegClass)
+          MRI->setRegAllocationHint(Dst, 0, TRI->getSubReg(Src, AMDGPU::lo16));
+        if (Src.isVirtual() &&
+            MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass &&
+            Dst.isPhysical() &&
+            TRI->getRegClassForReg(*MRI, Dst) == &AMDGPU::VGPR_32RegClass)
+          MRI->setRegAllocationHint(Src, 0, TRI->getSubReg(Dst, AMDGPU::lo16));
+        if (!Dst.isVirtual() || !Src.isVirtual())
+          continue;
+        if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_32RegClass &&
+            MRI->getRegClass(Src) == &AMDGPU::VGPR_16RegClass) {
+          MRI->setRegAllocationHint(Dst, AMDGPURI::Size32, Src);
+          MRI->setRegAllocationHint(Src, AMDGPURI::Size16, Dst);
+        }
+        if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
+            MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
+          MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
       }
-      if (MRI->getRegClass(Dst) == &AMDGPU::VGPR_16RegClass &&
-          MRI->getRegClass(Src) == &AMDGPU::VGPR_32RegClass)
-        MRI->setRegAllocationHint(Dst, AMDGPURI::Size16, Src);
+    }
+
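+    // Unpack the candidates only after the scan of the block has finished;
+    // insertMI rewrites the packed instruction in place, which could
+    // invalidate the iteration above if done eagerly.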
+    if (!instrsToUnpack.empty()) {
+      for (MachineInstr *MI : instrsToUnpack)
+        insertMI(*MI);
     }
   }
-
   return Changed;
 }