183 changes: 110 additions & 73 deletions llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -47,9 +47,6 @@ class SIPreEmitPeephole {
const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
// Check if the machine instruction being processed is a supported packed
// instruction.
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
// Creates a list of packed instructions following an MFMA that are suitable
// for unpacking.
void collectUnpackingCandidates(MachineInstr &BeginMI,
@@ -62,7 +59,7 @@ class SIPreEmitPeephole {
// v_fma_f32 v1, v0, v2, v2
// Here, we have overwritten v0 before we use it. This function checks if
// unpacking can lead to such a situation.
bool canUnpackingClobberRegister(const MachineInstr &MI);
bool canUnpackingClobberRegister(MachineInstr &MI);
// Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and
// V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for
// this transformation.
@@ -456,22 +453,8 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,

// If support is extended to new operations, add tests in
// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
Contributor @jrbyrnes commented on Oct 16, 2025:

Can you handle the removal of this in a separate NFC PR? I need it for something related, and I don't think it should be blocked by adding V_PK_MOV.

if (!TII->isNeverCoissue(MI))
return false;
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
case AMDGPU::V_PK_ADD_F32:
case AMDGPU::V_PK_MUL_F32:
case AMDGPU::V_PK_FMA_F32:
return true;
default:
return false;
}
llvm_unreachable("Fully covered switch");
}

bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
bool SIPreEmitPeephole::canUnpackingClobberRegister(MachineInstr &MI) {
unsigned OpCode = MI.getOpcode();
Register DstReg = MI.getOperand(0).getReg();
// Only the first register in the register pair needs to be checked due to the
@@ -482,21 +465,9 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
// Such scenarios can arise due to specific combinations of op_sel and
// op_sel_hi modifiers.
Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);

const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
if (Src0MO && Src0MO->isReg()) {
Register SrcReg0 = Src0MO->getReg();
unsigned Src0Mods =
TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
: TRI->getSubReg(SrcReg0, AMDGPU::sub0);
// Check if the register selected by op_sel_hi is the same as the first
// register in the destination register pair.
if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
return true;
}

uint16_t UnpackedOpCode = mapToUnpackedOpcode(MI);
bool UnpackedInstHasOneSrcOp =
!AMDGPU::hasNamedOperand(UnpackedOpCode, AMDGPU::OpName::src1);
const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
if (Src1MO && Src1MO->isReg()) {
Register SrcReg1 = Src1MO->getReg();
@@ -505,25 +476,44 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
: TRI->getSubReg(SrcReg1, AMDGPU::sub0);
// Check if the register selected by op_sel_hi is the same as the first
// register in the destination register pair.
if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
return true;
}

// Applicable for packed instructions with 3 source operands, such as
// V_PK_FMA.
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
const MachineOperand *Src2MO =
TII->getNamedOperand(MI, AMDGPU::OpName::src2);
if (Src2MO && Src2MO->isReg()) {
Register SrcReg2 = Src2MO->getReg();
unsigned Src2Mods =
TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
: TRI->getSubReg(SrcReg2, AMDGPU::sub0);
if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
// V_MOV_B32 has a single source operand. The checks below apply only to
// candidates whose unpacked form has two or more source operands.
if (!UnpackedInstHasOneSrcOp) {
const MachineOperand *Src0MO =
TII->getNamedOperand(MI, AMDGPU::OpName::src0);
if (Src0MO && Src0MO->isReg()) {
Register SrcReg0 = Src0MO->getReg();
unsigned Src0Mods =
TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
: TRI->getSubReg(SrcReg0, AMDGPU::sub0);
if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
return true;
}

// Applicable for packed instructions with 3 source operands, such as
// V_PK_FMA.
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
const MachineOperand *Src2MO =
TII->getNamedOperand(MI, AMDGPU::OpName::src2);
if (Src2MO && Src2MO->isReg()) {
Register SrcReg2 = Src2MO->getReg();
unsigned Src2Mods =
TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
: TRI->getSubReg(SrcReg2, AMDGPU::sub0);
if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
return true;
}
}
}
return false;
}
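
To make the hazard concrete, here is a small standalone C++ model of the check above (a sketch only: registers are modeled as plain integers, a VGPR pair v[N:N+1] by its low number N, and the OP_SEL_1 bit value is invented rather than taken from SISrcMods):

#include <iostream>

// Illustrative stand-in for SISrcMods::OP_SEL_1; not the real bit value.
constexpr unsigned kOpSelHi = 1u << 2;

unsigned hiLaneInput(unsigned SrcPairLo, unsigned Mods) {
  // The hi half of the unpacked sequence reads the sub-register that
  // op_sel_hi selects from the 64-bit source pair.
  return (Mods & kOpSelHi) ? SrcPairLo + 1 : SrcPairLo;
}

bool unpackingWouldClobber(unsigned DstPairLo, unsigned SrcPairLo,
                           unsigned Mods) {
  // The lo-half instruction is emitted first and writes DstPairLo; if the
  // hi-half instruction then reads DstPairLo, its input is already gone.
  return hiLaneInput(SrcPairLo, Mods) == DstPairLo;
}

int main() {
  // v_pk_mul_f32 v[0:1], v[2:3], v[0:1] with op_sel_hi clear on src1:
  //   lo: v_mul_f32 v0, v2, v0   <- writes v0
  //   hi: v_mul_f32 v1, v3, v0   <- still wants the old v0: hazard.
  std::cout << unpackingWouldClobber(0, 0, 0) << '\n';        // prints 1
  // With op_sel_hi set, the hi half reads v1 instead, so v0 is safe.
  std::cout << unpackingWouldClobber(0, 0, kOpSelHi) << '\n'; // prints 0
}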
@@ -540,6 +530,11 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
return AMDGPU::V_MUL_F32_e64;
case AMDGPU::V_PK_FMA_F32:
return AMDGPU::V_FMA_F32_e64;
case AMDGPU::V_PK_MOV_B32:
// V_MOV_B32 does not accept source modifiers, so the 64-bit VOP3 encoding
// is unnecessary; using it would only add instruction cache pressure.
return AMDGPU::V_MOV_B32_e32;
default:
return std::numeric_limits<uint16_t>::max();
}
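
The max() sentinel returned on the default path doubles as the "not unpackable" signal that collectUnpackingCandidates tests below, replacing the removed isUnpackingSupportedInstr predicate. A minimal model of that contract (opcode numbers are invented; only the shape matters):

#include <cstdint>
#include <iostream>
#include <limits>

// Invented opcode numbers; only the mapping shape matters.
enum : uint16_t { PK_ADD_F32, PK_MUL_F32, PK_FMA_F32, PK_MOV_B32,
                  ADD_F32_e64, MUL_F32_e64, FMA_F32_e64, MOV_B32_e32 };

constexpr uint16_t kNotUnpackable = std::numeric_limits<uint16_t>::max();

uint16_t mapToUnpacked(uint16_t Op) {
  switch (Op) {
  case PK_ADD_F32: return ADD_F32_e64;
  case PK_MUL_F32: return MUL_F32_e64;
  case PK_FMA_F32: return FMA_F32_e64;
  case PK_MOV_B32: return MOV_B32_e32; // VOP1: no modifiers, shorter encoding
  default:         return kNotUnpackable; // caller treats max() as "skip"
  }
}

int main() {
  std::cout << (mapToUnpacked(PK_MOV_B32) == MOV_B32_e32) << '\n'; // 1
  std::cout << (mapToUnpacked(0xBEEF) == kNotUnpackable) << '\n';  // 1
}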
@@ -549,6 +544,7 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
unsigned SrcMods, bool IsHiBits,
const MachineOperand &SrcMO) {
unsigned NewOpCode = NewMI->getOpcode();
unsigned NewSrcMods = 0;
unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
@@ -561,12 +557,18 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
// modifier for the higher 32 bits. Unpacked VOP3 instructions support
// ABS, but do not support NEG_HI. Therefore we need to explicitly add the
// NEG modifier if present in the packed instruction.
bool IsSrcModifiersSupported =
AMDGPU::hasNamedOperand(NewOpCode, AMDGPU::OpName::src0_modifiers);
bool UnpackedInstHasOneSrcOp =
!AMDGPU::hasNamedOperand(NewOpCode, AMDGPU::OpName::src1);

if (SrcMods & NegModifier)
NewSrcMods |= SISrcMods::NEG;
// Src modifiers. Only negative modifiers are added if needed. Unpacked
// operations do not have op_sel, therefore it must be handled explicitly as
// done below.
NewMI.addImm(NewSrcMods);
if (IsSrcModifiersSupported)
NewMI.addImm(NewSrcMods);
if (SrcMO.isImm()) {
NewMI.addImm(SrcMO.getImm());
return;
@@ -594,7 +596,7 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
bool OpSel = SrcMods & SISrcMods::OP_SEL_0;
bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1;
bool KillState = true;
if ((OpSel == OpSelHi) && !IsHiBits)
if ((OpSel == OpSelHi) && !IsHiBits && !UnpackedInstHasOneSrcOp)
KillState = false;
UnpackedSrcMO.setIsKill(KillState);
}
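
The modifier translation above reduces to one rule per half: the lo instruction inherits NEG, the hi instruction inherits NEG_HI, and either is re-emitted as a plain NEG on the unpacked instruction, but only when the unpacked opcode takes source modifiers at all. A compilable sketch of that rule (bit values are illustrative stand-ins, not the real SISrcMods encodings):

#include <cassert>

// Illustrative bit values; stand-ins for the real SISrcMods encodings.
constexpr unsigned NEG = 1u << 0;
constexpr unsigned NEG_HI = 1u << 1;

unsigned unpackedSrcMods(unsigned PackedMods, bool IsHiBits,
                         bool HasSrcModifiers) {
  // The lo half inherits NEG, the hi half inherits NEG_HI; either becomes a
  // plain NEG on the unpacked instruction -- if it takes modifiers at all.
  if (!HasSrcModifiers)
    return 0;
  unsigned NegBit = IsHiBits ? NEG_HI : NEG;
  return (PackedMods & NegBit) ? NEG : 0;
}

int main() {
  // neg_hi on the packed source becomes neg on the hi instruction...
  assert(unpackedSrcMods(NEG_HI, /*IsHiBits=*/true, true) == NEG);
  // ...and is dropped entirely from the lo instruction.
  assert(unpackedSrcMods(NEG_HI, /*IsHiBits=*/false, true) == 0);
  // V_MOV_B32_e32 takes no modifier operand, so nothing is emitted.
  assert(unpackedSrcMods(NEG, /*IsHiBits=*/false, false) == 0);
  return 0;
}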
@@ -612,10 +614,12 @@ void SIPreEmitPeephole::collectUnpackingCandidates(

for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
MachineInstr &Instr = *I;
uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
if (Instr.isMetaInstruction())
continue;
if ((Instr.isTerminator()) ||
(TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
(TII->isNeverCoissue(Instr) &&
(UnpackedOpCode == std::numeric_limits<uint16_t>::max())) ||
(SIInstrInfo::modifiesModeRegister(Instr) &&
Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
return;
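
The loop above distinguishes hard stops from mere skips: terminators, never-coissue instructions with no unpacked mapping, and EXEC/mode-register writers end the scan, while other instructions are stepped over and only mapped candidates are collected. A toy model of that control flow (instruction kinds and the cycle budget are simplified away from the real pass):

#include <iostream>
#include <vector>

// Toy stream of instruction kinds following an MFMA: the walk *stops* at
// hard barriers and *collects* only ops that have an unpacked mapping.
enum class Kind { UnpackablePK, NeverCoissueNoMapping, CoissuableAlu,
                  Terminator };

std::vector<int> collectCandidates(const std::vector<Kind> &Stream) {
  std::vector<int> Out;
  for (int I = 0, E = (int)Stream.size(); I != E; ++I) {
    Kind K = Stream[I];
    if (K == Kind::Terminator || K == Kind::NeverCoissueNoMapping)
      break; // the real pass also stops on EXEC/mode-register writes
    if (K == Kind::UnpackablePK)
      Out.push_back(I); // becomes an entry in InstrsToUnpack
    // anything else is stepped over without ending the scan
  }
  return Out;
}

int main() {
  std::vector<Kind> S = {Kind::CoissuableAlu, Kind::UnpackablePK,
                         Kind::NeverCoissueNoMapping, Kind::UnpackablePK};
  for (int I : collectCandidates(S))
    std::cout << I << ' '; // prints "1 ": the scan ends at index 2
  std::cout << '\n';
}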
@@ -639,7 +643,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
return;
}
if (!isUnpackingSupportedInstr(Instr))
if (UnpackedOpCode == std::numeric_limits<uint16_t>::max())
continue;

if (canUnpackingClobberRegister(Instr))
@@ -663,6 +667,28 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
"Unsupported Opcode");
// V_MOV_B32 does not support source modifiers. Without them, we cannot stay
// faithful to the packed instruction semantics in a few cases, namely when
// the packed instruction carries NEG or NEG_HI modifiers. We should abort
// unpacking if:
// 1. hi/lo bits selected by OPSEL for src0 are also marked by NEG or NEG_HI.
// 2. hi/lo bits selected by OPSEL_HI for src1 are also marked by NEG or
// NEG_HI.
// Packed instructions do not specify ABS modifiers, so we can safely ignore
// those.
if (!AMDGPU::hasNamedOperand(UnpackedOpcode,
AMDGPU::OpName::src0_modifiers)) {
unsigned Src0Mods =
TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm();
unsigned Src1Mods =
TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm();
unsigned NegMask0 =
(Src0Mods & SISrcMods::OP_SEL_0) ? SISrcMods::NEG_HI : SISrcMods::NEG;
unsigned NegMask1 =
(Src1Mods & SISrcMods::OP_SEL_1) ? SISrcMods::NEG_HI : SISrcMods::NEG;
if ((Src0Mods & NegMask0) || (Src1Mods & NegMask1))
return;
}

MachineInstrBuilder Op0LOp1L =
createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
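
Worked example of the abort test above, as a standalone sketch: op_sel selects which half of src0 the lo move reads and op_sel_hi which half of src1 the hi move reads, so only a NEG/NEG_HI bit on the selected half forces a bailout (bit values are illustrative, not the SISrcMods encodings):

#include <cassert>

// Illustrative bit values; not the SISrcMods encodings.
constexpr unsigned NEG = 1u << 0;
constexpr unsigned NEG_HI = 1u << 1;
constexpr unsigned OP_SEL_0 = 1u << 2;
constexpr unsigned OP_SEL_1 = 1u << 3;

bool mustAbortMovUnpack(unsigned Src0Mods, unsigned Src1Mods) {
  // V_MOV_B32_e32 cannot express negation, so bail out whenever a NEG bit
  // applies to the half that op_sel (src0) or op_sel_hi (src1) selects.
  unsigned NegMask0 = (Src0Mods & OP_SEL_0) ? SISrcMods_NEG_HI() : NEG;
  unsigned NegMask1 = (Src1Mods & OP_SEL_1) ? NEG_HI : NEG;
  return (Src0Mods & NegMask0) || (Src1Mods & NegMask1);
}

// Small helper so the two mask computations read symmetrically.
constexpr unsigned SISrcMods_NEG_HI() { return NEG_HI; }

int main() {
  // op_sel picks src0's hi half and that half is negated: cannot unpack.
  assert(mustAbortMovUnpack(OP_SEL_0 | NEG_HI, 0));
  // Negation on the half op_sel does not select is harmless.
  assert(!mustAbortMovUnpack(OP_SEL_0 | NEG, 0));
  return 0;
}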
Expand All @@ -689,8 +715,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
bool IsHiBits) {
MachineBasicBlock &MBB = *I.getParent();
const DebugLoc &DL = I.getDebugLoc();
const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
Register DstReg = I.getOperand(0).getReg();
unsigned OpCode = I.getOpcode();
Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
@@ -704,8 +730,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,

MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
NewMI.addDef(UnpackedDstReg); // vdst
addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1);
addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2);
if (AMDGPU::hasNamedOperand(UnpackedOpcode, AMDGPU::OpName::src0) &&
AMDGPU::hasNamedOperand(UnpackedOpcode, AMDGPU::OpName::src1)) {
addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);
} else {
const MachineOperand *SrcMO = IsHiBits ? SrcMO1 : SrcMO0;
unsigned SrcMods = IsHiBits ? Src1Mods : Src0Mods;
addOperandAndMods(NewMI, SrcMods, IsHiBits, *SrcMO);
}

if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
const MachineOperand *SrcMO3 =
@@ -714,10 +747,12 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3);
}
NewMI.addImm(ClampVal); // clamp
if (AMDGPU::hasNamedOperand(UnpackedOpcode, AMDGPU::OpName::clamp))
NewMI.addImm(ClampVal); // clamp
// Packed instructions do not support output modifiers, so it is safe to
// assign 0 here.
NewMI.addImm(0); // omod
if (AMDGPU::hasNamedOperand(UnpackedOpcode, AMDGPU::OpName::omod))
NewMI.addImm(0); // omod
return NewMI;
}
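
For the single-source V_MOV_B32 path above, the packed instruction still carries two sources; the lo unpacked move consumes src0, the hi move src1, and clamp/omod are appended only when the unpacked opcode has those operands. A toy model of the source selection (string operands stand in for MachineOperands):

#include <cassert>
#include <string>

// A packed V_PK_MOV_B32 holds two sources, but each unpacked V_MOV_B32_e32
// consumes exactly one: the lo move copies from src0, the hi move from src1.
std::string pickMovSource(const std::string &Src0, const std::string &Src1,
                          bool IsHiBits) {
  return IsHiBits ? Src1 : Src0;
}

int main() {
  assert(pickMovSource("v[2:3]", "v[4:5]", /*IsHiBits=*/false) == "v[2:3]");
  assert(pickMovSource("v[2:3]", "v[4:5]", /*IsHiBits=*/true) == "v[4:5]");
  return 0;
}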

@@ -789,22 +824,24 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {

// TODO: Fold this into previous block, if possible. Evaluate and handle any
// side effects.
for (MachineBasicBlock &MBB : MF) {
// Unpack packed instructions overlapped by MFMAs. This allows the compiler
// to co-issue unpacked instructions with MFMA
auto SchedModel = TII->getSchedModel();
SetVector<MachineInstr *> InstrsToUnpack;
for (auto &MI : make_early_inc_range(MBB.instrs())) {
if (!SIInstrInfo::isMFMA(MI))
continue;
const MCSchedClassDesc *SchedClassDesc =
SchedModel.resolveSchedClass(&MI);
uint16_t NumMFMACycles =
SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
}
for (MachineInstr *MI : InstrsToUnpack) {
performF32Unpacking(*MI);
if (ST.hasGFX950Insts() || ST.hasGFX940Insts()) {
for (MachineBasicBlock &MBB : MF) {
// Unpack packed instructions overlapped by MFMAs. This allows the
// compiler to co-issue unpacked instructions with MFMAs.
auto SchedModel = TII->getSchedModel();
SetVector<MachineInstr *> InstrsToUnpack;
for (auto &MI : make_early_inc_range(MBB.instrs())) {
if (!SIInstrInfo::isMFMA(MI))
continue;
const MCSchedClassDesc *SchedClassDesc =
SchedModel.resolveSchedClass(&MI);
uint16_t NumMFMACycles =
SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
}
for (MachineInstr *MI : InstrsToUnpack) {
performF32Unpacking(*MI);
}
}
}
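
The unpacking driver is now gated on the subtarget; a minimal sketch of that gate (the struct is a stand-in for GCNSubtarget, assuming only gfx940/gfx950-class targets benefit from co-issuing with MFMAs, as the condition above encodes):

#include <iostream>

// Stand-in for GCNSubtarget; only the two feature bits used by the gate.
struct Subtarget {
  bool HasGFX950Insts;
  bool HasGFX940Insts;
};

bool shouldRunUnpacker(const Subtarget &ST) {
  // Mirrors the condition above: run the unpacker only where unpacked F32
  // ops can be co-issued with in-flight MFMAs.
  return ST.HasGFX950Insts || ST.HasGFX940Insts;
}

int main() {
  Subtarget GFX950{true, false}, GFX90A{false, false};
  std::cout << shouldRunUnpacker(GFX950) << '\n'; // 1
  std::cout << shouldRunUnpacker(GFX90A) << '\n'; // 0
}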
