@@ -47,9 +47,6 @@ class SIPreEmitPeephole {
4747 const MachineBasicBlock &From,
4848 const MachineBasicBlock &To) const ;
4949 bool removeExeczBranch (MachineInstr &MI, MachineBasicBlock &SrcMBB);
50- // Check if the machine instruction being processed is a supported packed
51- // instruction.
52- bool isUnpackingSupportedInstr (MachineInstr &MI) const ;
5350 // Creates a list of packed instructions following an MFMA that are suitable
5451 // for unpacking.
5552 void collectUnpackingCandidates (MachineInstr &BeginMI,
@@ -454,23 +451,6 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
454451 return true ;
455452}
456453
457- // If support is extended to new operations, add tests in
458- // llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
459- bool SIPreEmitPeephole::isUnpackingSupportedInstr (MachineInstr &MI) const {
460- if (!TII->isNeverCoissue (MI))
461- return false ;
462- unsigned Opcode = MI.getOpcode ();
463- switch (Opcode) {
464- case AMDGPU::V_PK_ADD_F32:
465- case AMDGPU::V_PK_MUL_F32:
466- case AMDGPU::V_PK_FMA_F32:
467- return true ;
468- default :
469- return false ;
470- }
471- llvm_unreachable (" Fully covered switch" );
472- }
473-
474454bool SIPreEmitPeephole::canUnpackingClobberRegister (const MachineInstr &MI) {
475455 unsigned OpCode = MI.getOpcode ();
476456 Register DstReg = MI.getOperand (0 ).getReg ();
@@ -612,10 +592,12 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
612592
613593 for (auto I = std::next (BeginMI.getIterator ()); I != E; ++I) {
614594 MachineInstr &Instr = *I;
595+ uint16_t UnpackedOpCode = mapToUnpackedOpcode (Instr);
615596 if (Instr.isMetaInstruction ())
616597 continue ;
617598 if ((Instr.isTerminator ()) ||
618- (TII->isNeverCoissue (Instr) && !isUnpackingSupportedInstr (Instr)) ||
599+ (TII->isNeverCoissue (Instr) &&
600+ (UnpackedOpCode == std::numeric_limits<uint16_t >::max ())) ||
619601 (SIInstrInfo::modifiesModeRegister (Instr) &&
620602 Instr.modifiesRegister (AMDGPU::EXEC, TRI)))
621603 return ;
@@ -639,7 +621,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
639621 if (TRI->regsOverlap (MFMADef, InstrMO.getReg ()))
640622 return ;
641623 }
642- if (! isUnpackingSupportedInstr (Instr ))
624+ if (UnpackedOpCode == std::numeric_limits< uint16_t >:: max ( ))
643625 continue ;
644626
645627 if (canUnpackingClobberRegister (Instr))
@@ -687,8 +669,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
687669 bool IsHiBits) {
688670 MachineBasicBlock &MBB = *I.getParent ();
689671 const DebugLoc &DL = I.getDebugLoc ();
690- const MachineOperand *SrcMO1 = TII->getNamedOperand (I, AMDGPU::OpName::src0);
691- const MachineOperand *SrcMO2 = TII->getNamedOperand (I, AMDGPU::OpName::src1);
672+ const MachineOperand *SrcMO0 = TII->getNamedOperand (I, AMDGPU::OpName::src0);
673+ const MachineOperand *SrcMO1 = TII->getNamedOperand (I, AMDGPU::OpName::src1);
692674 Register DstReg = I.getOperand (0 ).getReg ();
693675 unsigned OpCode = I.getOpcode ();
694676 Register UnpackedDstReg = IsHiBits ? TRI->getSubReg (DstReg, AMDGPU::sub1)
@@ -702,15 +684,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
702684
703685 MachineInstrBuilder NewMI = BuildMI (MBB, I, DL, TII->get (UnpackedOpcode));
704686 NewMI.addDef (UnpackedDstReg); // vdst
705- addOperandAndMods (NewMI, Src0Mods, IsHiBits, *SrcMO1 );
706- addOperandAndMods (NewMI, Src1Mods, IsHiBits, *SrcMO2 );
687+ addOperandAndMods (NewMI, Src0Mods, IsHiBits, *SrcMO0 );
688+ addOperandAndMods (NewMI, Src1Mods, IsHiBits, *SrcMO1 );
707689
708690 if (AMDGPU::hasNamedOperand (OpCode, AMDGPU::OpName::src2)) {
709- const MachineOperand *SrcMO3 =
691+ const MachineOperand *SrcMO2 =
710692 TII->getNamedOperand (I, AMDGPU::OpName::src2);
711693 unsigned Src2Mods =
712694 TII->getNamedOperand (I, AMDGPU::OpName::src2_modifiers)->getImm ();
713- addOperandAndMods (NewMI, Src2Mods, IsHiBits, *SrcMO3 );
695+ addOperandAndMods (NewMI, Src2Mods, IsHiBits, *SrcMO2 );
714696 }
715697 NewMI.addImm (ClampVal); // clamp
716698 // Packed instructions do not support output modifiers. safe to assign them 0
@@ -787,22 +769,26 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
787769
788770 // TODO: Fold this into previous block, if possible. Evaluate and handle any
789771 // side effects.
790- for (MachineBasicBlock &MBB : MF) {
791- // Unpack packed instructions overlapped by MFMAs. This allows the compiler
792- // to co-issue unpacked instructions with MFMA
793- auto SchedModel = TII->getSchedModel ();
794- SetVector<MachineInstr *> InstrsToUnpack;
795- for (auto &MI : make_early_inc_range (MBB.instrs ())) {
796- if (!SIInstrInfo::isMFMA (MI))
797- continue ;
798- const MCSchedClassDesc *SchedClassDesc =
799- SchedModel.resolveSchedClass (&MI);
800- uint16_t NumMFMACycles =
801- SchedModel.getWriteProcResBegin (SchedClassDesc)->ReleaseAtCycle ;
802- collectUnpackingCandidates (MI, InstrsToUnpack, NumMFMACycles);
803- }
804- for (MachineInstr *MI : InstrsToUnpack) {
805- performF32Unpacking (*MI);
772+
773+ // Perform the extra MF scans only on subtargets with GFX940/GFX950 insts.
774+ if (ST.hasGFX950Insts () || ST.hasGFX940Insts ()) {
775+ for (MachineBasicBlock &MBB : MF) {
776+ // Unpack packed instructions overlapped by MFMAs. This allows the
777+ // compiler to co-issue the unpacked instructions with the MFMA.
778+ auto SchedModel = TII->getSchedModel ();
779+ SetVector<MachineInstr *> InstrsToUnpack;
780+ for (auto &MI : make_early_inc_range (MBB.instrs ())) {
781+ if (!SIInstrInfo::isMFMA (MI))
782+ continue ;
783+ const MCSchedClassDesc *SchedClassDesc =
784+ SchedModel.resolveSchedClass (&MI);
785+ uint16_t NumMFMACycles =
786+ SchedModel.getWriteProcResBegin (SchedClassDesc)->ReleaseAtCycle ;
787+ collectUnpackingCandidates (MI, InstrsToUnpack, NumMFMACycles);
788+ }
789+ for (MachineInstr *MI : InstrsToUnpack) {
790+ performF32Unpacking (*MI);
791+ }
806792 }
807793 }
808794
0 commit comments