@@ -875,13 +875,78 @@ GCNHazardRecognizer::checkVALUHazardsHelper(const MachineOperand &Def,
875875 return DataIdx >= 0 &&
876876 TRI->regsOverlap (MI.getOperand (DataIdx).getReg (), Reg);
877877 };
878+
878879 int WaitStatesNeededForDef =
879880 VALUWaitStates - getWaitStatesSince (IsHazardFn, VALUWaitStates);
880881 WaitStatesNeeded = std::max (WaitStatesNeeded, WaitStatesNeededForDef);
881882
882883 return WaitStatesNeeded;
883884}
884885
886+ // / Dest sel forwarding issue occurs if additional logic is needed to swizzle /
887+ // / pack the computed value into correct bit position of the dest register. This
888+ // / occurs if we have SDWA with dst_sel != DWORD or if we have op_sel with
889+ // / dst_sel that is not aligned to the register. This function analayzes the \p
890+ // / MI and \returns an operand with dst forwarding issue, or nullptr if
891+ // / none exists.
892+ static const MachineOperand *
893+ getDstSelForwardingOperand (const MachineInstr &MI, const GCNSubtarget &ST) {
894+ if (!SIInstrInfo::isVALU (MI))
895+ return nullptr ;
896+
897+ const SIInstrInfo *TII = ST.getInstrInfo ();
898+
899+ unsigned Opcode = MI.getOpcode ();
900+
901+ // There are three different types of instructions
902+ // which produce forwarded dest: 1. SDWA with dst_sel != DWORD, 2. VOP3
903+ // which write hi bits (e.g. op_sel[3] == 1), and 3. CVR_SR_FP8_F32 and
904+ // CVT_SR_BF8_F32 with op_sel[3:2]
905+ // != 0
906+ if (SIInstrInfo::isSDWA (MI)) {
907+ // Type 1: SDWA with dst_sel != DWORD
908+ if (auto *DstSel = TII->getNamedOperand (MI, AMDGPU::OpName::dst_sel))
909+ if (DstSel->getImm () == AMDGPU::SDWA::DWORD)
910+ return nullptr ;
911+ } else {
912+ // Type 2 && Type 3: (VOP3 which write the hi bits) || (CVT_SR_FP8_F32 and
913+ // CVT_SR_BF8_F32 with op_sel[3:2] != 0)
914+ if (!AMDGPU::hasNamedOperand (Opcode, AMDGPU::OpName::op_sel) ||
915+ !(TII->getNamedOperand (MI, AMDGPU::OpName::src0_modifiers)->getImm () &
916+ SISrcMods::DST_OP_SEL ||
917+ (AMDGPU::isFP8DstSelInst (Opcode) &&
918+ (TII->getNamedOperand (MI, AMDGPU::OpName::src2_modifiers)->getImm () &
919+ SISrcMods::OP_SEL_0))))
920+ return nullptr ;
921+ }
922+
923+ return TII->getNamedOperand (MI, AMDGPU::OpName::vdst);
924+ }
925+
926+ // / Checks whether the provided \p MI "consumes" the operand with a Dest sel
927+ // / fowarding issue \p Dst . We may "consume" the Dst via a standard explicit
928+ // / RAW, or through irregular ways (e.g implicit RAW, certain types of WAW)
929+ static bool consumesDstSelForwardingOperand (const MachineInstr *VALU,
930+ const MachineOperand *Dst,
931+ const SIRegisterInfo *TRI) {
932+ // We must consider implicit reads of the VALU. SDWA with dst_sel and
933+ // UNUSED_PRESERVE will implicitly read the result from forwarded dest,
934+ // and we must account for that hazard.
935+ // We also must account for WAW hazards. In particular, WAW with dest
936+ // preserve semantics (e.g. VOP3 with op_sel, VOP2 &&
937+ // !zeroesHigh16BitsOfDest) will read the forwarded dest for parity
938+ // check for ECC. Without accounting for this hazard, the ECC will be
939+ // wrong.
940+ // TODO: limit to RAW (including implicit reads) + problematic WAW (i.e.
941+ // complete zeroesHigh16BitsOfDest)
942+ for (auto &Operand : VALU->operands ()) {
943+ if (Operand.isReg () && TRI->regsOverlap (Dst->getReg (), Operand.getReg ())) {
944+ return true ;
945+ }
946+ }
947+ return false ;
948+ }
949+
885950int GCNHazardRecognizer::checkVALUHazards (MachineInstr *VALU) {
886951 int WaitStatesNeeded = 0 ;
887952
@@ -912,27 +977,18 @@ int GCNHazardRecognizer::checkVALUHazards(MachineInstr *VALU) {
912977 if (ST.hasDstSelForwardingHazard ()) {
913978 const int Shift16DefWaitstates = 1 ;
914979
915- auto IsShift16BitDefFn = [this , VALU](const MachineInstr &MI) {
916- if (!SIInstrInfo::isVALU (MI))
917- return false ;
918- const SIInstrInfo *TII = ST.getInstrInfo ();
919- if (SIInstrInfo::isSDWA (MI)) {
920- if (auto *DstSel = TII->getNamedOperand (MI, AMDGPU::OpName::dst_sel))
921- if (DstSel->getImm () == AMDGPU::SDWA::DWORD)
922- return false ;
923- } else {
924- if (!AMDGPU::hasNamedOperand (MI.getOpcode (), AMDGPU::OpName::op_sel) ||
925- !(TII->getNamedOperand (MI, AMDGPU::OpName::src0_modifiers)
926- ->getImm () &
927- SISrcMods::DST_OP_SEL))
928- return false ;
929- }
980+ auto IsShift16BitDefFn = [this , VALU](const MachineInstr &ProducerMI) {
930981 const SIRegisterInfo *TRI = ST.getRegisterInfo ();
931- if (auto *Dst = TII->getNamedOperand (MI, AMDGPU::OpName::vdst)) {
932- Register Def = Dst->getReg ();
982+ const MachineOperand *ForwardedDst =
983+ getDstSelForwardingOperand (ProducerMI, ST);
984+ if (ForwardedDst) {
985+ return consumesDstSelForwardingOperand (VALU, ForwardedDst, TRI);
986+ }
933987
934- for (const MachineOperand &Use : VALU->explicit_uses ()) {
935- if (Use.isReg () && TRI->regsOverlap (Def, Use.getReg ()))
988+ if (ProducerMI.isInlineAsm ()) {
989+ // Assume inline asm has dst forwarding hazard
990+ for (auto &Def : ProducerMI.all_defs ()) {
991+ if (consumesDstSelForwardingOperand (VALU, &Def, TRI))
936992 return true ;
937993 }
938994 }
@@ -1029,7 +1085,7 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
10291085 // problematic thus far.
10301086
10311087 // see checkVALUHazards()
1032- if (!ST.has12DWordStoreHazard ())
1088+ if (!ST.has12DWordStoreHazard () && !ST. hasDstSelForwardingHazard () )
10331089 return 0 ;
10341090
10351091 const MachineRegisterInfo &MRI = MF.getRegInfo ();
@@ -1038,11 +1094,45 @@ int GCNHazardRecognizer::checkInlineAsmHazards(MachineInstr *IA) {
10381094 for (const MachineOperand &Op :
10391095 llvm::drop_begin (IA->operands (), InlineAsm::MIOp_FirstOperand)) {
10401096 if (Op.isReg () && Op.isDef ()) {
1041- WaitStatesNeeded =
1042- std::max (WaitStatesNeeded, checkVALUHazardsHelper (Op, MRI));
1097+ if (!TRI.isVectorRegister (MRI, Op.getReg ()))
1098+ continue ;
1099+
1100+ if (ST.has12DWordStoreHazard ()) {
1101+ WaitStatesNeeded =
1102+ std::max (WaitStatesNeeded, checkVALUHazardsHelper (Op, MRI));
1103+ }
10431104 }
10441105 }
10451106
1107+ if (ST.hasDstSelForwardingHazard ()) {
1108+ const int Shift16DefWaitstates = 1 ;
1109+
1110+ auto IsShift16BitDefFn = [this , &IA](const MachineInstr &ProducerMI) {
1111+ const MachineOperand *Dst = getDstSelForwardingOperand (ProducerMI, ST);
1112+ // Assume inline asm reads the dst
1113+ if (Dst)
1114+ return IA->modifiesRegister (Dst->getReg (), &TRI) ||
1115+ IA->readsRegister (Dst->getReg (), &TRI);
1116+
1117+ if (ProducerMI.isInlineAsm ()) {
1118+ // If MI is inline asm, assume it has dst forwarding hazard
1119+ for (auto &Def : ProducerMI.all_defs ()) {
1120+ if (IA->modifiesRegister (Def.getReg (), &TRI) ||
1121+ IA->readsRegister (Def.getReg (), &TRI)) {
1122+ return true ;
1123+ }
1124+ }
1125+ }
1126+
1127+ return false ;
1128+ };
1129+
1130+ int WaitStatesNeededForDef =
1131+ Shift16DefWaitstates -
1132+ getWaitStatesSince (IsShift16BitDefFn, Shift16DefWaitstates);
1133+ WaitStatesNeeded = std::max (WaitStatesNeeded, WaitStatesNeededForDef);
1134+ }
1135+
10461136 return WaitStatesNeeded;
10471137}
10481138
0 commit comments