Skip to content

Commit 8b2fc00

Browse files
authored
[AMDGPU][NFC]: Minor Unpacking Fixes. (llvm#163992)
Optimize condition checks and remove compilation overhead for unsupported architectures.
1 parent 79d8a26 commit 8b2fc00

File tree

1 file changed

+17
-30
lines changed

1 file changed

+17
-30
lines changed

llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp

Lines changed: 17 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,6 @@ class SIPreEmitPeephole {
4747
const MachineBasicBlock &From,
4848
const MachineBasicBlock &To) const;
4949
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
50-
// Check if the machine instruction being processed is a supported packed
51-
// instruction.
52-
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
5350
// Creates a list of packed instructions following an MFMA that are suitable
5451
// for unpacking.
5552
void collectUnpackingCandidates(MachineInstr &BeginMI,
@@ -454,23 +451,6 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
454451
return true;
455452
}
456453

457-
// If support is extended to new operations, add tests in
458-
// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
459-
bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
460-
if (!TII->isNeverCoissue(MI))
461-
return false;
462-
unsigned Opcode = MI.getOpcode();
463-
switch (Opcode) {
464-
case AMDGPU::V_PK_ADD_F32:
465-
case AMDGPU::V_PK_MUL_F32:
466-
case AMDGPU::V_PK_FMA_F32:
467-
return true;
468-
default:
469-
return false;
470-
}
471-
llvm_unreachable("Fully covered switch");
472-
}
473-
474454
bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
475455
unsigned OpCode = MI.getOpcode();
476456
Register DstReg = MI.getOperand(0).getReg();
@@ -612,10 +592,13 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
612592

613593
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
614594
MachineInstr &Instr = *I;
595+
uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
596+
bool IsUnpackable =
597+
!(UnpackedOpCode == std::numeric_limits<uint16_t>::max());
615598
if (Instr.isMetaInstruction())
616599
continue;
617600
if ((Instr.isTerminator()) ||
618-
(TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
601+
(TII->isNeverCoissue(Instr) && !IsUnpackable) ||
619602
(SIInstrInfo::modifiesModeRegister(Instr) &&
620603
Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
621604
return;
@@ -639,7 +622,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
639622
if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
640623
return;
641624
}
642-
if (!isUnpackingSupportedInstr(Instr))
625+
if (!IsUnpackable)
643626
continue;
644627

645628
if (canUnpackingClobberRegister(Instr))
@@ -687,8 +670,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
687670
bool IsHiBits) {
688671
MachineBasicBlock &MBB = *I.getParent();
689672
const DebugLoc &DL = I.getDebugLoc();
690-
const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
691-
const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
673+
const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
674+
const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
692675
Register DstReg = I.getOperand(0).getReg();
693676
unsigned OpCode = I.getOpcode();
694677
Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
@@ -702,15 +685,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
702685

703686
MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
704687
NewMI.addDef(UnpackedDstReg); // vdst
705-
addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1);
706-
addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2);
688+
addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
689+
addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);
707690

708691
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
709-
const MachineOperand *SrcMO3 =
692+
const MachineOperand *SrcMO2 =
710693
TII->getNamedOperand(I, AMDGPU::OpName::src2);
711694
unsigned Src2Mods =
712695
TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
713-
addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3);
696+
addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO2);
714697
}
715698
NewMI.addImm(ClampVal); // clamp
716699
// Packed instructions do not support output modifiers. Safe to assign them 0
@@ -787,9 +770,13 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
787770

788771
// TODO: Fold this into previous block, if possible. Evaluate and handle any
789772
// side effects.
773+
774+
// Perform the extra MF scans only for supported archs
775+
if (!ST.hasGFX940Insts())
776+
return Changed;
790777
for (MachineBasicBlock &MBB : MF) {
791-
// Unpack packed instructions overlapped by MFMAs. This allows the compiler
792-
// to co-issue unpacked instructions with MFMA
778+
// Unpack packed instructions overlapped by MFMAs. This allows the
779+
// compiler to co-issue unpacked instructions with MFMA
793780
auto SchedModel = TII->getSchedModel();
794781
SetVector<MachineInstr *> InstrsToUnpack;
795782
for (auto &MI : make_early_inc_range(MBB.instrs())) {

0 commit comments

Comments
 (0)