@@ -62,7 +62,7 @@ class SIPreEmitPeephole {
6262 // v_fma_f32 v1, v0, v2, v2
6363 // Here, we have overwritten v0 before we use it. This function checks if
6464 // unpacking can lead to such a situation.
65- bool canUnpackingIntroduceDependencies (const MachineInstr &MI);
65+ bool canUnpackingClobberRegister (const MachineInstr &MI);
6666 // Unpack and insert F32 packed instructions, such as V_PK_MUL, V_PK_ADD, and
6767 // V_PK_FMA. Currently, only V_PK_MUL, V_PK_ADD, V_PK_FMA are supported for
6868 // this transformation.
@@ -469,7 +469,7 @@ bool SIPreEmitPeephole::isUnpackingSupportedInstr(MachineInstr &MI) const {
469469 llvm_unreachable (" Fully covered switch" );
470470}
471471
472- bool SIPreEmitPeephole::canUnpackingIntroduceDependencies (
472+ bool SIPreEmitPeephole::canUnpackingClobberRegister (
473473 const MachineInstr &MI) {
474474 unsigned OpCode = MI.getOpcode ();
475475 Register DstReg = MI.getOperand (0 ).getReg ();
@@ -481,14 +481,12 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
481481 // Such scenarios can arise due to specific combinations of op_sel and
482482 // op_sel_hi modifiers.
483483 Register UnpackedDstReg = TRI->getSubReg (DstReg, AMDGPU::sub0);
484- unsigned Src0Mods =
485- TII->getNamedOperand (MI, AMDGPU::OpName::src0_modifiers)->getImm ();
486- unsigned Src1Mods =
487- TII->getNamedOperand (MI, AMDGPU::OpName::src1_modifiers)->getImm ();
488484
489485 const MachineOperand *Src0MO = TII->getNamedOperand (MI, AMDGPU::OpName::src0);
490- if (Src0MO->isReg ()) {
486+ if (Src0MO && Src0MO ->isReg ()) {
491487 Register SrcReg0 = Src0MO->getReg ();
488+ unsigned Src0Mods =
489+ TII->getNamedOperand (MI, AMDGPU::OpName::src0_modifiers)->getImm ();
492490 Register HiSrc0Reg = (Src0Mods & SISrcMods::OP_SEL_1)
493491 ? TRI->getSubReg (SrcReg0, AMDGPU::sub1)
494492 : TRI->getSubReg (SrcReg0, AMDGPU::sub0);
@@ -499,8 +497,10 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
499497 }
500498
501499 const MachineOperand *Src1MO = TII->getNamedOperand (MI, AMDGPU::OpName::src1);
502- if (Src1MO->isReg ()) {
500+ if (Src1MO && Src1MO ->isReg ()) {
503501 Register SrcReg1 = Src1MO->getReg ();
502+ unsigned Src1Mods =
503+ TII->getNamedOperand (MI, AMDGPU::OpName::src1_modifiers)->getImm ();
504504 Register HiSrc1Reg = (Src1Mods & SISrcMods::OP_SEL_1)
505505 ? TRI->getSubReg (SrcReg1, AMDGPU::sub1)
506506 : TRI->getSubReg (SrcReg1, AMDGPU::sub0);
@@ -511,13 +511,13 @@ bool SIPreEmitPeephole::canUnpackingIntroduceDependencies(
511511 // Applicable for packed instructions with 3 source operands, such as
512512 // V_PK_FMA.
513513 if (AMDGPU::hasNamedOperand (OpCode, AMDGPU::OpName::src2)) {
514- unsigned Src2Mods =
515- TII->getNamedOperand (MI, AMDGPU::OpName::src2_modifiers)->getImm ();
516514 const MachineOperand *Src2MO =
517515 TII->getNamedOperand (MI, AMDGPU::OpName::src2);
518- if (Src2MO->isReg ()) {
516+ if (Src2MO && Src2MO ->isReg ()) {
519517 Register SrcReg2 =
520518 TII->getNamedOperand (MI, AMDGPU::OpName::src2)->getReg ();
519+ unsigned Src2Mods =
520+ TII->getNamedOperand (MI, AMDGPU::OpName::src2_modifiers)->getImm ();
521521 Register HiSrc2Reg = (Src2Mods & SISrcMods::OP_SEL_1)
522522 ? TRI->getSubReg (SrcReg2, AMDGPU::sub1)
523523 : TRI->getSubReg (SrcReg2, AMDGPU::sub0);
@@ -614,49 +614,46 @@ void SIPreEmitPeephole::collectUnpackingCandidates(
614614 MachineInstr &Instr = *I;
615615 if (Instr.isMetaInstruction ())
616616 continue ;
617- if (Instr.isTerminator ())
618- return ;
619- if (TII->isNeverCoissue (Instr) && !isUnpackingSupportedInstr (Instr))
620- return ;
621- if (SIInstrInfo::modifiesModeRegister (Instr) &&
622- Instr.modifiesRegister (AMDGPU::EXEC, TRI))
617+ if ((Instr.isTerminator ()) ||
618+ (TII->isNeverCoissue (Instr) && !isUnpackingSupportedInstr (Instr)) ||
619+ (SIInstrInfo::modifiesModeRegister (Instr) &&
620+ Instr.modifiesRegister (AMDGPU::EXEC, TRI)))
623621 return ;
622+
624623 const MCSchedClassDesc *InstrSchedClassDesc =
625624 SchedModel.resolveSchedClass (&Instr);
626- TotalCyclesBetweenCandidates + =
625+ uint16_t Latency =
627626 SchedModel.getWriteProcResBegin (InstrSchedClassDesc)->ReleaseAtCycle ;
627+ TotalCyclesBetweenCandidates += Latency;
628628
629- if (TotalCyclesBetweenCandidates > NumMFMACycles)
629+ if (TotalCyclesBetweenCandidates > NumMFMACycles - 1 )
630630 return ;
631631 // Identify register dependencies between those used by the MFMA
632632 // instruction and the following packed instructions. Also checks for
633633 // transitive dependencies between the MFMA def and candidate instruction
634634 // def and uses. Conservatively ensures that we do not incorrectly
635635 // read/write registers.
636636 for (const MachineOperand &InstrMO : Instr.operands ()) {
637- if (InstrMO.isReg ()) {
638- if (TRI->regsOverlap (MFMADef, InstrMO.getReg ()))
639- return ;
640- }
641- }
642- if (isUnpackingSupportedInstr (Instr)) {
643- assert (TII->isNeverCoissue (Instr) && " Instruction cannot be co-issued." );
644- if (canUnpackingIntroduceDependencies (Instr))
637+ if (!InstrMO.isReg () || !InstrMO.getReg ().isValid ())
638+ continue ;
639+ if (TRI->regsOverlap (MFMADef, InstrMO.getReg ()))
645640 return ;
646- // If it is a packed instruction, we should subtract it's latency from the
647- // overall latency calculation here, because the packed instruction will
648- // be removed and replaced by 2 unpacked instructions.
649- TotalCyclesBetweenCandidates -=
650- SchedModel.getWriteProcResBegin (InstrSchedClassDesc)->ReleaseAtCycle ;
651- // We're adding 2 to account for the extra latency added by unpacking into
652- // 2 instructions. At the time of writing, the considered unpacked
653- // instructions have latency of 1.
654- // TODO: improve latency handling of possible inserted instructions.
655- TotalCyclesBetweenCandidates += 2 ;
656- // Subtract 1 to account for MFMA issue latency.
657- if (!(TotalCyclesBetweenCandidates >= NumMFMACycles - 1 ))
658- InstrsToUnpack.insert (&Instr);
659641 }
642+ if (!isUnpackingSupportedInstr (Instr))
643+ continue ;
644+
645+ assert (TII->isNeverCoissue (Instr) && " Instruction cannot be co-issued." );
646+ if (canUnpackingClobberRegister (Instr))
647+ return ;
648+ // If it's a packed instruction, adjust latency: remove the packed
649+ // latency, add latency of two unpacked instructions (currently estimated
650+ // as 2 cycles).
651+ TotalCyclesBetweenCandidates -= Latency;
652+ // TODO: improve latency handling based on instruction modeling.
653+ TotalCyclesBetweenCandidates += 2 ;
654+ // Subtract 1 to account for MFMA issue latency.
655+ if (TotalCyclesBetweenCandidates < NumMFMACycles - 1 )
656+ InstrsToUnpack.insert (&Instr);
660657 }
661658 return ;
662659}
@@ -672,8 +669,7 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
672669 createUnpackedMI (I, UnpackedOpcode, /* IsHiBits=*/ false );
673670 MachineOperand LoDstOp = Op0LOp1L->getOperand (0 );
674671
675- if (DstOp.isUndef ())
676- LoDstOp.setIsUndef ();
672+ LoDstOp.setIsUndef (DstOp.isUndef ());
677673
678674 MachineInstrBuilder Op0HOp1H =
679675 createUnpackedMI (I, UnpackedOpcode, /* IsHiBits=*/ true );
@@ -687,10 +683,9 @@ void SIPreEmitPeephole::performF32Unpacking(MachineInstr &I) {
687683 Op0LOp1L->setFlag (MachineInstr::MIFlag::FmContract);
688684 Op0HOp1H->setFlag (MachineInstr::MIFlag::FmContract);
689685 }
690- if (DstOp.getReg ().isPhysical () && DstOp.isRenamable ()) {
691- LoDstOp.setIsRenamable (true );
692- HiDstOp.setIsRenamable (true );
693- }
686+
687+ LoDstOp.setIsRenamable (DstOp.isRenamable ());
688+ HiDstOp.setIsRenamable (DstOp.isRenamable ());
694689
695690 I.eraseFromParent ();
696691 return ;
@@ -804,22 +799,19 @@ bool SIPreEmitPeephole::run(MachineFunction &MF) {
804799 for (MachineBasicBlock &MBB : MF) {
805800 // Unpack packed instructions overlapped by MFMAs. This allows the compiler
806801 // to co-issue unpacked instructions with MFMA
807- uint16_t NumMFMACycles = 0 ;
808802 auto SchedModel = TII->getSchedModel ();
809803 SetVector<MachineInstr *> InstrsToUnpack;
810804 for (auto &MI : make_early_inc_range (MBB.instrs ())) {
811- if (SIInstrInfo::isMFMA (MI)) {
812- const MCSchedClassDesc *SchedClassDesc =
813- SchedModel. resolveSchedClass (&MI);
814- NumMFMACycles =
815- SchedModel. getWriteProcResBegin (SchedClassDesc)-> ReleaseAtCycle ;
816- collectUnpackingCandidates (MI, InstrsToUnpack, NumMFMACycles) ;
817- }
805+ if (! SIInstrInfo::isMFMA (MI))
806+ continue ;
807+ const MCSchedClassDesc *SchedClassDesc =
808+ SchedModel. resolveSchedClass (&MI);
809+ uint16_t NumMFMACycles =
810+ SchedModel. getWriteProcResBegin (SchedClassDesc)-> ReleaseAtCycle ;
811+ collectUnpackingCandidates (MI, InstrsToUnpack, NumMFMACycles);
818812 }
819- if (!InstrsToUnpack.empty ()) {
820- for (MachineInstr *MI : InstrsToUnpack) {
821- performF32Unpacking (*MI);
822- }
813+ for (MachineInstr *MI : InstrsToUnpack) {
814+ performF32Unpacking (*MI);
823815 }
824816 }
825817
0 commit comments