Skip to content

Commit dc0089e

Browse files
committed
add support for v_pk_mov unpacking, redundant code removal, bug fixes
1 parent 19a58a5 commit dc0089e

File tree

2 files changed

+699
-87
lines changed

2 files changed

+699
-87
lines changed

llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp

Lines changed: 110 additions & 75 deletions
Original file line numberDiff line numberDiff line change
@@ -47,9 +47,6 @@ class SIPreEmitPeephole {
4747
const MachineBasicBlock &From,
4848
const MachineBasicBlock &To) const;
4949
bool removeExeczBranch(MachineInstr &MI, MachineBasicBlock &SrcMBB);
50-
// Check if the machine instruction being processed is a supported packed
51-
// instruction.
52-
bool isUnpackingSupportedInstr(MachineInstr &MI) const;
5350
// Creates a list of packed instructions following an MFMA that are suitable
5451
// for unpacking.
5552
void collectUnpackingCandidates(MachineInstr &BeginMI,
@@ -62,7 +59,7 @@ class SIPreEmitPeephole {
6259
// v_fma_f32 v1, v0, v2, v2
6360
// Here, we have overwritten v0 before we use it. This function checks if
6461
// unpacking can lead to such a situation.
65-
bool canUnpackingClobberRegister(const MachineInstr &MI);
62+
bool canUnpackingClobberRegister(MachineInstr &MI);
6663
// Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and
6764
// V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for
6865
// this transformation.
@@ -456,22 +453,8 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
456453

457454
// If support is extended to new operations, add tests in
458455
// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
459-
bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
460-
if (!TII->isNeverCoissue(MI))
461-
return false;
462-
unsigned Opcode = MI.getOpcode();
463-
switch (Opcode) {
464-
case AMDGPU::V_PK_ADD_F32:
465-
case AMDGPU::V_PK_MUL_F32:
466-
case AMDGPU::V_PK_FMA_F32:
467-
return true;
468-
default:
469-
return false;
470-
}
471-
llvm_unreachable("Fully covered switch");
472-
}
473456

474-
bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
457+
bool SIPreEmitPeephole::canUnpackingClobberRegister(MachineInstr &MI) {
475458
unsigned OpCode = MI.getOpcode();
476459
Register DstReg = MI.getOperand(0).getReg();
477460
// Only the first register in the register pair needs to be checked due to the
@@ -482,21 +465,9 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
482465
// Such scenarios can arise due to specific combinations of op_sel and
483466
// op_sel_hi modifiers.
484467
Register UnpackedDstReg = TRI->getSubReg(DstReg, AMDGPU::sub0);
485-
486-
const MachineOperand *Src0MO = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
487-
if (Src0MO && Src0MO->isReg()) {
488-
Register SrcReg0 = Src0MO->getReg();
489-
unsigned Src0Mods =
490-
TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
491-
Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
492-
? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
493-
: TRI->getSubReg(SrcReg0, AMDGPU::sub0);
494-
// Check if the register selected by op_sel_hi is the same as the first
495-
// register in the destination register pair.
496-
if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
497-
return true;
498-
}
499-
468+
uint16_t UnpackedOpCode = mapToUnpackedOpcode(MI);
469+
bool UnpackedInstHasOneSrcOp =
470+
!AMDGPU::hasNamedOperand(UnpackedOpCode, AMDGPU::OpName::src1);
500471
const MachineOperand *Src1MO = TII->getNamedOperand(MI, AMDGPU::OpName::src1);
501472
if (Src1MO && Src1MO->isReg()) {
502473
Register SrcReg1 = Src1MO->getReg();
@@ -505,25 +476,44 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
505476
Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
506477
? TRI->getSubReg(SrcReg1, AMDGPU::sub1)
507478
: TRI->getSubReg(SrcReg1, AMDGPU::sub0);
479+
// Check if the register selected by op_sel_hi is the same as the first
480+
// register in the destination register pair.
508481
if (TRI->regsOverlap(UnpackedDstReg, HiSrc1Reg))
509482
return true;
510483
}
511484

512-
// Applicable for packed instructions with 3 source operands, such as
513-
// V_PK_FMA.
514-
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
515-
const MachineOperand *Src2MO =
516-
TII->getNamedOperand(MI, AMDGPU::OpName::src2);
517-
if (Src2MO && Src2MO->isReg()) {
518-
Register SrcReg2 = Src2MO->getReg();
519-
unsigned Src2Mods =
520-
TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
521-
Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
522-
? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
523-
: TRI->getSubReg(SrcReg2, AMDGPU::sub0);
524-
if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
485+
// V_MOV_B32 has a single src operand. Other candidate unpacked instructions
486+
// with two or more src operands must perform the following checks.
487+
if (!UnpackedInstHasOneSrcOp) {
488+
const MachineOperand *Src0MO =
489+
TII->getNamedOperand(MI, AMDGPU::OpName::src0);
490+
if (Src0MO && Src0MO->isReg()) {
491+
Register SrcReg0 = Src0MO->getReg();
492+
unsigned Src0Mods =
493+
TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm();
494+
Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
495+
? TRI->getSubReg(SrcReg0, AMDGPU::sub1)
496+
: TRI->getSubReg(SrcReg0, AMDGPU::sub0);
497+
if (TRI->regsOverlap(UnpackedDstReg, HiSrc0Reg))
525498
return true;
526499
}
500+
501+
// Applicable for packed instructions with 3 source operands, such as
502+
// V_PK_FMA.
503+
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
504+
const MachineOperand *Src2MO =
505+
TII->getNamedOperand(MI, AMDGPU::OpName::src2);
506+
if (Src2MO && Src2MO->isReg()) {
507+
Register SrcReg2 = Src2MO->getReg();
508+
unsigned Src2Mods =
509+
TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm();
510+
Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
511+
? TRI->getSubReg(SrcReg2, AMDGPU::sub1)
512+
: TRI->getSubReg(SrcReg2, AMDGPU::sub0);
513+
if (TRI->regsOverlap(UnpackedDstReg, HiSrc2Reg))
514+
return true;
515+
}
516+
}
527517
}
528518
return false;
529519
}
@@ -540,6 +530,11 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
540530
return AMDGPU::V_MUL_F32_e64;
541531
case AMDGPU::V_PK_FMA_F32:
542532
return AMDGPU::V_FMA_F32_e64;
533+
case AMDGPU::V_PK_MOV_B32:
534+
// Source modifiers aren't handled for MOV due to prevailing restrictions.
535+
// Hence, the 64-bit encoding isn't necessary and would only create
536+
// unnecessary instruction-cache pressure.
537+
return AMDGPU::V_MOV_B32_e32;
543538
default:
544539
return std::numeric_limits<uint16_t>::max();
545540
}
@@ -549,6 +544,7 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
549544
void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
550545
unsigned SrcMods, bool IsHiBits,
551546
const MachineOperand &SrcMO) {
547+
unsigned NewOpCode = NewMI->getOpcode();
552548
unsigned NewSrcMods = 0;
553549
unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
554550
unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
@@ -561,12 +557,18 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
561557
// modifier for the higher 32 bits. Unpacked VOP3 instructions support
562558
// ABS, but do not support NEG_HI. Therefore we need to explicitly add the
563559
// NEG modifier if present in the packed instruction.
560+
bool IsSrcModifidiersSupported =
561+
AMDGPU::hasNamedOperand(NewOpCode, AMDGPU::OpName::src0_modifiers);
562+
bool UnpackedInstHasOneSrcOp =
563+
!AMDGPU::hasNamedOperand(NewOpCode, AMDGPU::OpName::src1);
564+
564565
if (SrcMods & NegModifier)
565566
NewSrcMods |= SISrcMods::NEG;
566567
// Src modifiers. Only negative modifiers are added if needed. Unpacked
567568
// operations do not have op_sel, therefore it must be handled explicitly as
568569
// done below.
569-
NewMI.addImm(NewSrcMods);
570+
if (IsSrcModifidiersSupported)
571+
NewMI.addImm(NewSrcMods);
570572
if (SrcMO.isImm()) {
571573
NewMI.addImm(SrcMO.getImm());
572574
return;
@@ -594,7 +596,7 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
594596
bool OpSel = SrcMods & SISrcMods::OP_SEL_0;
595597
bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1;
596598
bool KillState = true;
597-
if ((OpSel == OpSelHi) && !IsHiBits)
599+
if ((OpSel == OpSelHi) && !IsHiBits && !UnpackedInstHasOneSrcOp)
598600
KillState = false;
599601
UnpackedSrcMO.setIsKill(KillState);
600602
}
@@ -612,10 +614,12 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
612614

613615
for (auto I = std::next(BeginMI.getIterator()); I != E; ++I) {
614616
MachineInstr &Instr = *I;
617+
uint16_t UnpackedOpCode = mapToUnpackedOpcode(Instr);
615618
if (Instr.isMetaInstruction())
616619
continue;
617620
if ((Instr.isTerminator()) ||
618-
(TII->isNeverCoissue(Instr) && !isUnpackingSupportedInstr(Instr)) ||
621+
(TII->isNeverCoissue(Instr) &&
622+
(UnpackedOpCode == std::numeric_limits<uint16_t>::max())) ||
619623
(SIInstrInfo::modifiesModeRegister(Instr) &&
620624
Instr.modifiesRegister(AMDGPU::EXEC, TRI)))
621625
return;
@@ -639,7 +643,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
639643
if (TRI->regsOverlap(MFMADef, InstrMO.getReg()))
640644
return;
641645
}
642-
if (!isUnpackingSupportedInstr(Instr))
646+
if (UnpackedOpCode == std::numeric_limits<uint16_t>::max())
643647
continue;
644648

645649
if (canUnpackingClobberRegister(Instr))
@@ -661,8 +665,28 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
661665
MachineOperand DstOp = I.getOperand(0);
662666

663667
uint16_t UnpackedOpcode = mapToUnpackedOpcode(I);
664-
assert(UnpackedOpcode != std::numeric_limits<uint16_t>::max() &&
665-
"Unsupported Opcode");
668+
// V_MOV_B32 does not support source modifiers. Without source modifiers, we
669+
// cannot be faithful to the packed instruction semantics in a few cases. This
670+
// is true when the packed instruction has NEG and NEG_HI modifiers. We should
671+
// abort unpacking if:
672+
// 1. hi/lo bits selected by OPSEL for src0 are also marked by NEG or NEG_HI.
673+
// 2. hi/lo bits selected by OPSEL_HI for src1 are also marked by NEG or
674+
// NEG_HI.
675+
// Packed instructions do not specify ABS modifiers, so we can safely ignore
676+
// those.
677+
if (!AMDGPU::hasNamedOperand(UnpackedOpcode,
678+
AMDGPU::OpName::src0_modifiers)) {
679+
unsigned Src0Mods =
680+
TII->getNamedOperand(I, AMDGPU::OpName::src0_modifiers)->getImm();
681+
unsigned Src1Mods =
682+
TII->getNamedOperand(I, AMDGPU::OpName::src1_modifiers)->getImm();
683+
unsigned negMask0 =
684+
(Src0Mods & SISrcMods::OP_SEL_0) ? SISrcMods::NEG_HI : SISrcMods::NEG;
685+
unsigned negMask1 =
686+
(Src1Mods & SISrcMods::OP_SEL_1) ? SISrcMods::NEG_HI : SISrcMods::NEG;
687+
if ((Src0Mods & negMask0) || (Src1Mods & negMask1))
688+
return;
689+
}
666690

667691
MachineInstrBuilder Op0LOp1L =
668692
createUnpackedMI(I, UnpackedOpcode, /*IsHiBits=*/false);
@@ -689,8 +713,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
689713
bool IsHiBits) {
690714
MachineBasicBlock &MBB = *I.getParent();
691715
const DebugLoc &DL = I.getDebugLoc();
692-
const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
693-
const MachineOperand *SrcMO2 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
716+
const MachineOperand *SrcMO0 = TII->getNamedOperand(I, AMDGPU::OpName::src0);
717+
const MachineOperand *SrcMO1 = TII->getNamedOperand(I, AMDGPU::OpName::src1);
694718
Register DstReg = I.getOperand(0).getReg();
695719
unsigned OpCode = I.getOpcode();
696720
Register UnpackedDstReg = IsHiBits ? TRI->getSubReg(DstReg, AMDGPU::sub1)
@@ -704,8 +728,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
704728

705729
MachineInstrBuilder NewMI = BuildMI(MBB, I, DL, TII->get(UnpackedOpcode));
706730
NewMI.addDef(UnpackedDstReg); // vdst
707-
addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO1);
708-
addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO2);
731+
if (AMDGPU::hasNamedOperand(UnpackedOpcode, AMDGPU::OpName::src0) &&
732+
AMDGPU::hasNamedOperand(UnpackedOpcode, AMDGPU::OpName::src1)) {
733+
addOperandAndMods(NewMI, Src0Mods, IsHiBits, *SrcMO0);
734+
addOperandAndMods(NewMI, Src1Mods, IsHiBits, *SrcMO1);
735+
} else {
736+
const MachineOperand *SrcMO = IsHiBits ? SrcMO1 : SrcMO0;
737+
unsigned SrcMods = IsHiBits ? Src1Mods : Src0Mods;
738+
addOperandAndMods(NewMI, SrcMods, IsHiBits, *SrcMO);
739+
}
709740

710741
if (AMDGPU::hasNamedOperand(OpCode, AMDGPU::OpName::src2)) {
711742
const MachineOperand *SrcMO3 =
@@ -714,10 +745,12 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
714745
TII->getNamedOperand(I, AMDGPU::OpName::src2_modifiers)->getImm();
715746
addOperandAndMods(NewMI, Src2Mods, IsHiBits, *SrcMO3);
716747
}
717-
NewMI.addImm(ClampVal); // clamp
748+
if (AMDGPU::hasNamedOperand(UnpackedOpcode, AMDGPU::OpName::clamp))
749+
NewMI.addImm(ClampVal); // clamp
718750
// Packed instructions do not support output modifiers, so it is safe to
719751
// assign them 0 for this use case.
720-
NewMI.addImm(0); // omod
752+
if (AMDGPU::hasNamedOperand(UnpackedOpcode, AMDGPU::OpName::omod))
753+
NewMI.addImm(0); // omod
721754
return NewMI;
722755
}
723756

@@ -789,22 +822,24 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
789822

790823
// TODO: Fold this into previous block, if possible. Evaluate and handle any
791824
// side effects.
792-
for (MachineBasicBlock &MBB : MF) {
793-
// Unpack packed instructions overlapped by MFMAs. This allows the compiler
794-
// to co-issue unpacked instructions with MFMA
795-
auto SchedModel = TII->getSchedModel();
796-
SetVector<MachineInstr *> InstrsToUnpack;
797-
for (auto &MI : make_early_inc_range(MBB.instrs())) {
798-
if (!SIInstrInfo::isMFMA(MI))
799-
continue;
800-
const MCSchedClassDesc *SchedClassDesc =
801-
SchedModel.resolveSchedClass(&MI);
802-
uint16_t NumMFMACycles =
803-
SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
804-
collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
805-
}
806-
for (MachineInstr *MI : InstrsToUnpack) {
807-
performF32Unpacking(*MI);
825+
if (ST.hasGFX950Insts() || ST.hasGFX940Insts()) {
826+
for (MachineBasicBlock &MBB : MF) {
827+
// Unpack packed instructions overlapped by MFMAs. This allows the
828+
// compiler to co-issue unpacked instructions with MFMA
829+
auto SchedModel = TII->getSchedModel();
830+
SetVector<MachineInstr *> InstrsToUnpack;
831+
for (auto &MI : make_early_inc_range(MBB.instrs())) {
832+
if (!SIInstrInfo::isMFMA(MI))
833+
continue;
834+
const MCSchedClassDesc *SchedClassDesc =
835+
SchedModel.resolveSchedClass(&MI);
836+
uint16_t NumMFMACycles =
837+
SchedModel.getWriteProcResBegin(SchedClassDesc)->ReleaseAtCycle;
838+
collectUnpackingCandidates(MI, InstrsToUnpack, NumMFMACycles);
839+
}
840+
for (MachineInstr *MI : InstrsToUnpack) {
841+
performF32Unpacking(*MI);
842+
}
808843
}
809844
}
810845

0 commit comments

Comments
 (0)