Skip to content

Commit a7b24e0

Browse files
committed
Simplify redundant condition and reduce MF scan overhead for unsupported archs
1 parent ed1716f commit a7b24e0

File tree

1 file changed

+30
-44
lines changed

1 file changed

+30
-44
lines changed

llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp

Lines changed: 30 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -47,9 +47,6 @@ class SIPreEmitPeephole {
4747
const MachineBasicBlock &From,
4848
const MachineBasicBlock &To) const;
4949
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
50-
// Check if the machine instruction being processed is a supported packed
51-
// instruction.
52-
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
5350
// Creates a list of packed instructions following an MFMA that are suitable
5451
// for unpacking.
5552
void collectUnpackingCandidates(MachineInstr &BeginMI,
@@ -454,23 +451,6 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
454451
return true;
455452
}
456453

457-
// If support is extended to new operations, add tests in
458-
// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
459-
bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
460-
if (!TII->isNeverCoissue(MI))
461-
return false;
462-
unsigned Opcode = MI.getOpcode();
463-
switch (Opcode) {
464-
case AMDGPU::V_PK_ADD_F32:
465-
case AMDGPU::V_PK_MUL_F32:
466-
case AMDGPU::V_PK_FMA_F32:
467-
return true;
468-
default:
469-
return false;
470-
}
471-
llvm_unreachable("Fully covered switch");
472-
}
473-
474454
bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
475455
unsigned OpCode = MI.getOpcode();
476456
Register DstReg = MI.getOperand(0).getReg();
@@ -612,10 +592,12 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
612592

613593
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
614594
MachineInstr &Instr = *I;
595+
uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
615596
if (Instr.isMetaInstruction())
616597
continue;
617598
if ((Instr.isTerminator()) ||
618-
(TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
599+
(TII->isNeverCoissue(Instr) &&
600+
(UnpackedOpCode == std::numeric_limits<uint16_t>::max())) ||
619601
(SIInstrInfo::modifiesModeRegister(Instr) &&
620602
Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
621603
return;
@@ -639,7 +621,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
639621
if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
640622
return;
641623
}
642-
if (!isUnpackingSupportedInstr(Instr))
624+
if (UnpackedOpCode == std::numeric_limits<uint16_t>::max())
643625
continue;
644626

645627
if (canUnpackingClobberRegister(Instr))
@@ -687,8 +669,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
687669
bool IsHiBits) {
688670
MachineBasicBlock &MBB = *I.getParent();
689671
const DebugLoc &DL = I.getDebugLoc();
690-
const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
691-
const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
672+
const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
673+
const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
692674
Register DstReg = I.getOperand(0).getReg();
693675
unsigned OpCode = I.getOpcode();
694676
Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
@@ -702,15 +684,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
702684

703685
MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
704686
NewMI.addDef(UnpackedDstReg); // vdst
705-
addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1);
706-
addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2);
687+
addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
688+
addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);
707689

708690
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
709-
const MachineOperand *SrcMO3 =
691+
const MachineOperand *SrcMO2 =
710692
TII->getNamedOperand(I, AMDGPU::OpName::src2);
711693
unsigned Src2Mods =
712694
TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
713-
addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3);
695+
addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2);
714696
}
715697
NewMI.addImm(ClampVal); // clamp
716698
// Packed instructions do not support output modifiers. Safe to assign them 0
@@ -787,22 +769,26 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
787769

788770
// TODO: Fold this into previous block, if possible. Evaluate and handle any
789771
// side effects.
790-
for (MachineBasicBlock &MBB : MF) {
791-
// Unpack packed instructions overlapped by MFMAs. This allows the compiler
792-
// to co-issue unpacked instructions with MFMA
793-
auto SchedModel = TII->getSchedModel();
794-
SetVector<MachineInstr *> InstrsToUnpack;
795-
for (auto &MI : make_early_inc_range(MBB.instrs())) {
796-
if (!SIInstrInfo::isMFMA(MI))
797-
continue;
798-
const MCSchedClassDesc *SchedClassDesc =
799-
SchedModel.resolveSchedClass(&MI);
800-
uint16_t NumMFMACycles =
801-
SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
802-
collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
803-
}
804-
for (MachineInstr *MI : InstrsToUnpack) {
805-
performF32Unpacking(*MI);
772+
773+
// Perform the extra MF scans only for supported architectures.
774+
if (ST.hasGFX950Insts() || ST.hasGFX940Insts()) {
775+
for (MachineBasicBlock &MBB : MF) {
776+
// Unpack packed instructions overlapped by MFMAs. This allows the compiler
777+
// to co-issue unpacked instructions with MFMAs.
778+
auto SchedModel = TII->getSchedModel();
779+
SetVector<MachineInstr *> InstrsToUnpack;
780+
for (auto &MI : make_early_inc_range(MBB.instrs())) {
781+
if (!SIInstrInfo::isMFMA(MI))
782+
continue;
783+
const MCSchedClassDesc *SchedClassDesc =
784+
SchedModel.resolveSchedClass(&MI);
785+
uint16_t NumMFMACycles =
786+
SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
787+
collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
788+
}
789+
for (MachineInstr *MI : InstrsToUnpack) {
790+
performF32Unpacking(*MI);
791+
}
806792
}
807793
}
808794

0 commit comments

Comments
 (0)