@@ -47,9 +47,6 @@ class SIPreEmitPeephole {
4747 const MachineBasicBlock &From,
4848 const MachineBasicBlock &To) const ;
4949 bool removeExeczBranch (MachineInstr &MI, MachineBasicBlock &SrcMBB);
50- // Check if the machine instruction being processed is a supported packed
51- // instruction.
52- bool isUnpackingSupportedInstr (MachineInstr &MI) const ;
5350 // Creates a list of packed instructions following an MFMA that are suitable
5451 // for unpacking.
5552 void collectUnpackingCandidates (MachineInstr &BeginMI,
@@ -62,7 +59,7 @@ class SIPreEmitPeephole {
6259 // v_fma_f32 v1, v0, v2, v2
6360 // Here, we have overwritten v0 before we use it. This function checks if
6461 // unpacking can lead to such a situation.
65- bool canUnpackingClobberRegister (const MachineInstr &MI);
62+ bool canUnpackingClobberRegister (MachineInstr &MI);
6663 // Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and
6764 // V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for
6865 // this transformation.
@@ -456,22 +453,8 @@ bool SIPreEmitPeephole::removeExeczBranch(MachineInstr &MI,
456453
457454// If support is extended to new operations, add tests in
458455// llvm/test/CodeGen/AMDGPU/unpack-non-coissue-insts-post-ra-scheduler.mir.
459- bool SIPreEmitPeephole::isUnpackingSupportedInstr (MachineInstr &MI) const {
460- if (!TII->isNeverCoissue (MI))
461- return false ;
462- unsigned Opcode = MI.getOpcode ();
463- switch (Opcode) {
464- case AMDGPU::V_PK_ADD_F32:
465- case AMDGPU::V_PK_MUL_F32:
466- case AMDGPU::V_PK_FMA_F32:
467- return true ;
468- default :
469- return false ;
470- }
471- llvm_unreachable (" Fully covered switch" );
472- }
473456
474- bool SIPreEmitPeephole::canUnpackingClobberRegister (const MachineInstr &MI) {
457+ bool SIPreEmitPeephole::canUnpackingClobberRegister (MachineInstr &MI) {
475458 unsigned OpCode = MI.getOpcode ();
476459 Register DstReg = MI.getOperand (0 ).getReg ();
477460 // Only the first register in the register pair needs to be checked due to the
@@ -482,21 +465,9 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
482465 // Such scenarios can arise due to specific combinations of op_sel and
483466 // op_sel_hi modifiers.
484467 Register UnpackedDstReg = TRI->getSubReg (DstReg, AMDGPU::sub0);
485-
486- const MachineOperand *Src0MO = TII->getNamedOperand (MI, AMDGPU::OpName::src0);
487- if (Src0MO && Src0MO->isReg ()) {
488- Register SrcReg0 = Src0MO->getReg ();
489- unsigned Src0Mods =
490- TII->getNamedOperand (MI, AMDGPU::OpName::src0_modifiers)->getImm ();
491- Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
492- ? TRI->getSubReg (SrcReg0, AMDGPU::sub1)
493- : TRI->getSubReg (SrcReg0, AMDGPU::sub0);
494- // Check if the register selected by op_sel_hi is the same as the first
495- // register in the destination register pair.
496- if (TRI->regsOverlap (UnpackedDstReg, HiSrc0Reg))
497- return true ;
498- }
499-
468+ uint16_t UnpackedOpCode = mapToUnpackedOpcode (MI);
469+ bool UnpackedInstHasOneSrcOp =
470+ !AMDGPU::hasNamedOperand (UnpackedOpCode, AMDGPU::OpName::src1);
500471 const MachineOperand *Src1MO = TII->getNamedOperand (MI, AMDGPU::OpName::src1);
501472 if (Src1MO && Src1MO->isReg ()) {
502473 Register SrcReg1 = Src1MO->getReg ();
@@ -505,25 +476,44 @@ bool SIPreEmitPeephole::canUnpackingClobberRegister(const MachineInstr &MI) {
505476 Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
506477 ? TRI->getSubReg (SrcReg1, AMDGPU::sub1)
507478 : TRI->getSubReg (SrcReg1, AMDGPU::sub0);
479+ // Check if the register selected by op_sel_hi is the same as the first
480+ // register in the destination register pair.
508481 if (TRI->regsOverlap (UnpackedDstReg, HiSrc1Reg))
509482 return true ;
510483 }
511484
512- // Applicable for packed instructions with 3 source operands, such as
513- // V_PK_FMA .
514- if (AMDGPU::hasNamedOperand (OpCode, AMDGPU::OpName::src2) ) {
515- const MachineOperand *Src2MO =
516- TII->getNamedOperand (MI, AMDGPU::OpName::src2 );
517- if (Src2MO && Src2MO ->isReg ()) {
518- Register SrcReg2 = Src2MO ->getReg ();
519- unsigned Src2Mods =
520- TII->getNamedOperand (MI, AMDGPU::OpName::src2_modifiers )->getImm ();
521- Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
522- ? TRI->getSubReg (SrcReg2 , AMDGPU::sub1)
523- : TRI->getSubReg (SrcReg2 , AMDGPU::sub0);
524- if (TRI->regsOverlap (UnpackedDstReg, HiSrc2Reg ))
485+ // V_MOV_B32s have one src operand. The following checks apply only to
486+ // candidate unpacked instructions with 2 or more src operands.
487+ if (!UnpackedInstHasOneSrcOp ) {
488+ const MachineOperand *Src0MO =
489+ TII->getNamedOperand (MI, AMDGPU::OpName::src0 );
490+ if (Src0MO && Src0MO ->isReg ()) {
491+ Register SrcReg0 = Src0MO ->getReg ();
492+ unsigned Src0Mods =
493+ TII->getNamedOperand (MI, AMDGPU::OpName::src0_modifiers )->getImm ();
494+ Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
495+ ? TRI->getSubReg (SrcReg0 , AMDGPU::sub1)
496+ : TRI->getSubReg (SrcReg0 , AMDGPU::sub0);
497+ if (TRI->regsOverlap (UnpackedDstReg, HiSrc0Reg ))
525498 return true ;
526499 }
500+
501+ // Applicable for packed instructions with 3 source operands, such as
502+ // V_PK_FMA.
503+ if (AMDGPU::hasNamedOperand (OpCode, AMDGPU::OpName::src2)) {
504+ const MachineOperand *Src2MO =
505+ TII->getNamedOperand (MI, AMDGPU::OpName::src2);
506+ if (Src2MO && Src2MO->isReg ()) {
507+ Register SrcReg2 = Src2MO->getReg ();
508+ unsigned Src2Mods =
509+ TII->getNamedOperand (MI, AMDGPU::OpName::src2_modifiers)->getImm ();
510+ Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
511+ ? TRI->getSubReg (SrcReg2, AMDGPU::sub1)
512+ : TRI->getSubReg (SrcReg2, AMDGPU::sub0);
513+ if (TRI->regsOverlap (UnpackedDstReg, HiSrc2Reg))
514+ return true ;
515+ }
516+ }
527517 }
528518 return false ;
529519}
@@ -540,6 +530,11 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
540530 return AMDGPU::V_MUL_F32_e64;
541531 case AMDGPU::V_PK_FMA_F32:
542532 return AMDGPU::V_FMA_F32_e64;
533+ case AMDGPU::V_PK_MOV_B32:
534+ // Source modifiers aren't handled for MOV due to prevailing restrictions.
535+ // Hence, the 64-bit encoding isn't necessary; using it would only create
536+ // unnecessary instruction cache pressure.
537+ return AMDGPU::V_MOV_B32_e32;
543538 default :
544539 return std::numeric_limits<uint16_t >::max ();
545540 }
@@ -549,6 +544,7 @@ uint16_t SIPreEmitPeephole::mapToUnpackedOpcode(MachineInstr &I) {
549544void SIPreEmitPeephole::addOperandAndMods (MachineInstrBuilder &NewMI,
550545 unsigned SrcMods, bool IsHiBits,
551546 const MachineOperand &SrcMO) {
547+ unsigned NewOpCode = NewMI->getOpcode ();
552548 unsigned NewSrcMods = 0 ;
553549 unsigned NegModifier = IsHiBits ? SISrcMods::NEG_HI : SISrcMods::NEG;
554550 unsigned OpSelModifier = IsHiBits ? SISrcMods::OP_SEL_1 : SISrcMods::OP_SEL_0;
@@ -561,12 +557,18 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
561557 // modifier for the higher 32 bits. Unpacked VOP3 instructions support
562558 // ABS, but do not support NEG_HI. Therefore we need to explicitly add the
563559 // NEG modifier if present in the packed instruction.
560+ bool IsSrcModifidiersSupported =
561+ AMDGPU::hasNamedOperand (NewOpCode, AMDGPU::OpName::src0_modifiers);
562+ bool UnpackedInstHasOneSrcOp =
563+ !AMDGPU::hasNamedOperand (NewOpCode, AMDGPU::OpName::src1);
564+
564565 if (SrcMods & NegModifier)
565566 NewSrcMods |= SISrcMods::NEG;
566567 // Src modifiers. Only negative modifiers are added if needed. Unpacked
567568 // operations do not have op_sel, therefore it must be handled explicitly as
568569 // done below.
569- NewMI.addImm (NewSrcMods);
570+ if (IsSrcModifidiersSupported)
571+ NewMI.addImm (NewSrcMods);
570572 if (SrcMO.isImm ()) {
571573 NewMI.addImm (SrcMO.getImm ());
572574 return ;
@@ -594,7 +596,7 @@ void SIPreEmitPeephole::addOperandAndMods(MachineInstrBuilder &NewMI,
594596 bool OpSel = SrcMods & SISrcMods::OP_SEL_0;
595597 bool OpSelHi = SrcMods & SISrcMods::OP_SEL_1;
596598 bool KillState = true ;
597- if ((OpSel == OpSelHi) && !IsHiBits)
599+ if ((OpSel == OpSelHi) && !IsHiBits && !UnpackedInstHasOneSrcOp )
598600 KillState = false ;
599601 UnpackedSrcMO.setIsKill (KillState);
600602 }
@@ -612,10 +614,12 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
612614
613615 for (auto I = std::next (BeginMI.getIterator ()); I != E; ++I) {
614616 MachineInstr &Instr = *I;
617+ uint16_t UnpackedOpCode = mapToUnpackedOpcode (Instr);
615618 if (Instr.isMetaInstruction ())
616619 continue ;
617620 if ((Instr.isTerminator ()) ||
618- (TII->isNeverCoissue (Instr) && !isUnpackingSupportedInstr (Instr)) ||
621+ (TII->isNeverCoissue (Instr) &&
622+ (UnpackedOpCode == std::numeric_limits<uint16_t >::max ())) ||
619623 (SIInstrInfo::modifiesModeRegister (Instr) &&
620624 Instr.modifiesRegister (AMDGPU::EXEC, TRI)))
621625 return ;
@@ -639,7 +643,7 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
639643 if (TRI->regsOverlap (MFMADef, InstrMO.getReg ()))
640644 return ;
641645 }
642- if (! isUnpackingSupportedInstr (Instr ))
646+ if (UnpackedOpCode == std::numeric_limits< uint16_t >:: max ( ))
643647 continue ;
644648
645649 if (canUnpackingClobberRegister (Instr))
@@ -661,8 +665,28 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
661665 MachineOperand DstOp = I.getOperand (0 );
662666
663667 uint16_t UnpackedOpcode = mapToUnpackedOpcode (I);
664- assert (UnpackedOpcode != std::numeric_limits<uint16_t >::max () &&
665- " Unsupported Opcode" );
668+ // V_MOV_B32 does not support source modifiers. Without source modifiers, we
669+ // cannot be faithful to the packed instruction semantics in a few cases. This
670+ // is true when the packed instruction has NEG and NEG_HI modifiers. We should
671+ // abort unpacking if:
672+ // 1. hi/lo bits selected by OPSEL for src0 are also marked by NEG or NEG_HI.
673+ // 2. hi/lo bits selected by OPSEL_HI for src1 are also marked by NEG or
674+ // NEG_HI.
675+ // Packed instructions do not specify ABS modifiers, so we can safely ignore
676+ // those.
677+ if (!AMDGPU::hasNamedOperand (UnpackedOpcode,
678+ AMDGPU::OpName::src0_modifiers)) {
679+ unsigned Src0Mods =
680+ TII->getNamedOperand (I, AMDGPU::OpName::src0_modifiers)->getImm ();
681+ unsigned Src1Mods =
682+ TII->getNamedOperand (I, AMDGPU::OpName::src1_modifiers)->getImm ();
683+ unsigned negMask0 =
684+ (Src0Mods & SISrcMods::OP_SEL_0) ? SISrcMods::NEG_HI : SISrcMods::NEG;
685+ unsigned negMask1 =
686+ (Src1Mods & SISrcMods::OP_SEL_1) ? SISrcMods::NEG_HI : SISrcMods::NEG;
687+ if ((Src0Mods & negMask0) || (Src1Mods & negMask1))
688+ return ;
689+ }
666690
667691 MachineInstrBuilder Op0LOp1L =
668692 createUnpackedMI (I, UnpackedOpcode, /* IsHiBits=*/ false );
@@ -689,8 +713,8 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
689713 bool IsHiBits) {
690714 MachineBasicBlock &MBB = *I.getParent ();
691715 const DebugLoc &DL = I.getDebugLoc ();
692- const MachineOperand *SrcMO1 = TII->getNamedOperand (I, AMDGPU::OpName::src0);
693- const MachineOperand *SrcMO2 = TII->getNamedOperand (I, AMDGPU::OpName::src1);
716+ const MachineOperand *SrcMO0 = TII->getNamedOperand (I, AMDGPU::OpName::src0);
717+ const MachineOperand *SrcMO1 = TII->getNamedOperand (I, AMDGPU::OpName::src1);
694718 Register DstReg = I.getOperand (0 ).getReg ();
695719 unsigned OpCode = I.getOpcode ();
696720 Register UnpackedDstReg = IsHiBits ? TRI->getSubReg (DstReg, AMDGPU::sub1)
@@ -704,8 +728,15 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
704728
705729 MachineInstrBuilder NewMI = BuildMI (MBB, I, DL, TII->get (UnpackedOpcode));
706730 NewMI.addDef (UnpackedDstReg); // vdst
707- addOperandAndMods (NewMI, Src0Mods, IsHiBits, *SrcMO1);
708- addOperandAndMods (NewMI, Src1Mods, IsHiBits, *SrcMO2);
731+ if (AMDGPU::hasNamedOperand (UnpackedOpcode, AMDGPU::OpName::src0) &&
732+ AMDGPU::hasNamedOperand (UnpackedOpcode, AMDGPU::OpName::src1)) {
733+ addOperandAndMods (NewMI, Src0Mods, IsHiBits, *SrcMO0);
734+ addOperandAndMods (NewMI, Src1Mods, IsHiBits, *SrcMO1);
735+ } else {
736+ const MachineOperand *SrcMO = IsHiBits ? SrcMO1 : SrcMO0;
737+ unsigned SrcMods = IsHiBits ? Src1Mods : Src0Mods;
738+ addOperandAndMods (NewMI, SrcMods, IsHiBits, *SrcMO);
739+ }
709740
710741 if (AMDGPU::hasNamedOperand (OpCode, AMDGPU::OpName::src2)) {
711742 const MachineOperand *SrcMO3 =
@@ -714,10 +745,12 @@ MachineInstrBuilder SIPreEmitPeephole::createUnpackedMI(MachineInstr &I,
714745 TII->getNamedOperand (I, AMDGPU::OpName::src2_modifiers)->getImm ();
715746 addOperandAndMods (NewMI, Src2Mods, IsHiBits, *SrcMO3);
716747 }
717- NewMI.addImm (ClampVal); // clamp
748+ if (AMDGPU::hasNamedOperand (UnpackedOpcode, AMDGPU::OpName::clamp))
749+ NewMI.addImm (ClampVal); // clamp
718750 // Packed instructions do not support output modifiers. safe to assign them 0
719751 // for this use case
720- NewMI.addImm (0 ); // omod
752+ if (AMDGPU::hasNamedOperand (UnpackedOpcode, AMDGPU::OpName::omod))
753+ NewMI.addImm (0 ); // omod
721754 return NewMI;
722755}
723756
@@ -789,22 +822,24 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
789822
790823 // TODO: Fold this into previous block, if possible. Evaluate and handle any
791824 // side effects.
792- for (MachineBasicBlock &MBB : MF) {
793- // Unpack packed instructions overlapped by MFMAs. This allows the compiler
794- // to co-issue unpacked instructions with MFMA
795- auto SchedModel = TII->getSchedModel ();
796- SetVector<MachineInstr *> InstrsToUnpack;
797- for (auto &MI : make_early_inc_range (MBB.instrs ())) {
798- if (!SIInstrInfo::isMFMA (MI))
799- continue ;
800- const MCSchedClassDesc *SchedClassDesc =
801- SchedModel.resolveSchedClass (&MI);
802- uint16_t NumMFMACycles =
803- SchedModel.getWriteProcResBegin (SchedClassDesc)->ReleaseAtCycle ;
804- collectUnpackingCandidates (MI, InstrsToUnpack, NumMFMACycles);
805- }
806- for (MachineInstr *MI : InstrsToUnpack) {
807- performF32Unpacking (*MI);
825+ if (ST.hasGFX950Insts () || ST.hasGFX940Insts ()) {
826+ for (MachineBasicBlock &MBB : MF) {
827+ // Unpack packed instructions overlapped by MFMAs. This allows the
828+ // compiler to co-issue unpacked instructions with MFMA
829+ auto SchedModel = TII->getSchedModel ();
830+ SetVector<MachineInstr *> InstrsToUnpack;
831+ for (auto &MI : make_early_inc_range (MBB.instrs ())) {
832+ if (!SIInstrInfo::isMFMA (MI))
833+ continue ;
834+ const MCSchedClassDesc *SchedClassDesc =
835+ SchedModel.resolveSchedClass (&MI);
836+ uint16_t NumMFMACycles =
837+ SchedModel.getWriteProcResBegin (SchedClassDesc)->ReleaseAtCycle ;
838+ collectUnpackingCandidates (MI, InstrsToUnpack, NumMFMACycles);
839+ }
840+ for (MachineInstr *MI : InstrsToUnpack) {
841+ performF32Unpacking (*MI);
842+ }
808843 }
809844 }
810845
0 commit comments