diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index ecf03b14143ee..5207201e14c09 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -916,21 +916,30 @@ getDstSelForwardingOperand(const MachineInstr &MI, const GCNSubtarget &ST) { if (SIInstrInfo::isSDWA(MI)) { // Type 1: SDWA with dst_sel != DWORD if (auto *DstSel = TII->getNamedOperand(MI, AMDGPU::OpName::dst_sel)) - if (DstSel->getImm() == AMDGPU::SDWA::DWORD) - return nullptr; - } else { - // Type 2 && Type 3: (VOP3 which write the hi bits) || (FP8DstSelInst - // with op_sel[3:2] != 0) - if (!AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel) || - !(TII->getNamedOperand(MI, AMDGPU::OpName::src0_modifiers)->getImm() & - SISrcMods::DST_OP_SEL || - (AMDGPU::isFP8DstSelInst(Opcode) && - (TII->getNamedOperand(MI, AMDGPU::OpName::src2_modifiers)->getImm() & - SISrcMods::OP_SEL_0)))) - return nullptr; - } - - return TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + if (DstSel->getImm() != AMDGPU::SDWA::DWORD) + return TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + } + + AMDGPU::FPType IsFP4OrFP8ConvOpc = AMDGPU::getFPDstSelType(Opcode); + if (AMDGPU::hasNamedOperand(Opcode, AMDGPU::OpName::op_sel)) { + // Type 2: VOP3 which write the hi bits + if (TII->getNamedImmOperand(MI, AMDGPU::OpName::src0_modifiers) & + SISrcMods::DST_OP_SEL) + return TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + + // Type 3: FP8DstSelInst with op_sel[3:2] != 0) + if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP8 && + (TII->getNamedImmOperand(MI, AMDGPU::OpName::src2_modifiers) & + SISrcMods::OP_SEL_0)) + return TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + } + + // Special case: nop is required for all the opsel values for fp4 sr variant + // cvt scale instructions + if (IsFP4OrFP8ConvOpc == AMDGPU::FPType::FP4) + return TII->getNamedOperand(MI, AMDGPU::OpName::vdst); + + return nullptr; } /// Checks whether the provided \p MI "consumes" the operand with a Dest sel diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 7bc6db4cec106..bb78e77a9dc1a 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2567,6 +2567,7 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { field bit IsFP8SrcByteSel = 0; field bit IsFP8DstByteSel = 0; field bit HasFP8DstByteSel = 0; + field bit HasFP4DstByteSel = 0; field bit IsFP8ByteSel = !or(IsFP8SrcByteSel, IsFP8DstByteSel); field bit HasDst = !ne(DstVT.Value, untyped.Value); @@ -3249,13 +3250,13 @@ def isMFMA_F8F6F4Table : GenericTable { let PrimaryKeyName = "isMFMA_F8F6F4" ; } -def FP8DstByteSelTable : GenericTable { +def FP4FP8DstByteSelTable : GenericTable { let FilterClass = "VOP3_Pseudo"; - let CppTypeName = "FP8DstByteSelInfo"; - let Fields = ["Opcode", "HasFP8DstByteSel"]; + let CppTypeName = "FP4FP8DstByteSelInfo"; + let Fields = ["Opcode", "HasFP8DstByteSel", "HasFP4DstByteSel"]; let PrimaryKey = ["Opcode"]; - let PrimaryKeyName = "getFP8DstByteSelHelper"; + let PrimaryKeyName = "getFP4FP8DstByteSelHelper"; } def VOPDComponentTable : GenericTable { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 5a0e812748fbb..61866f3345e14 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -378,17 +378,18 @@ struct VOPTrue16Info { bool IsTrue16; }; -#define GET_FP8DstByteSelTable_DECL -#define GET_FP8DstByteSelTable_IMPL +#define GET_FP4FP8DstByteSelTable_DECL +#define GET_FP4FP8DstByteSelTable_IMPL struct DPMACCInstructionInfo { uint16_t Opcode; bool IsDPMACCInstruction; }; -struct FP8DstByteSelInfo { +struct FP4FP8DstByteSelInfo { uint16_t Opcode; bool HasFP8DstByteSel; + bool HasFP4DstByteSel; }; #define GET_FP8DstByteSelTable_DECL @@ -657,9 +658,16 @@ bool isTrue16Inst(unsigned Opc) { return Info ? Info->IsTrue16 : false; } -bool isFP8DstSelInst(unsigned Opc) { - const FP8DstByteSelInfo *Info = getFP8DstByteSelHelper(Opc); - return Info ? Info->HasFP8DstByteSel : false; +FPType getFPDstSelType(unsigned Opc) { + const FP4FP8DstByteSelInfo *Info = getFP4FP8DstByteSelHelper(Opc); + if (!Info) + return FPType::None; + if (Info->HasFP8DstByteSel) + return FPType::FP8; + if (Info->HasFP4DstByteSel) + return FPType::FP4; + + return FPType::None; } unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) { diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index ea497d7b239d7..29f64d0db8dd2 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -55,6 +55,8 @@ static constexpr unsigned GFX12 = 1; enum { AMDHSA_COV4 = 4, AMDHSA_COV5 = 5, AMDHSA_COV6 = 6 }; +enum class FPType { None, FP4, FP8 }; + /// \returns True if \p STI is AMDHSA. bool isHsaAbi(const MCSubtargetInfo &STI); @@ -885,7 +887,7 @@ LLVM_READONLY bool isTrue16Inst(unsigned Opc); LLVM_READONLY -bool isFP8DstSelInst(unsigned Opc); +FPType getFPDstSelType(unsigned Opc); LLVM_READONLY bool isInvalidSingleUseConsumerInst(unsigned Opc); diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 47b60bb0fdab3..7566cca4a295c 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -1014,7 +1014,7 @@ class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile : let HasExtVOP3DPP = 0; let HasOpSel = 1; let HasOMod = 0; - let HasFP8DstByteSel = 1; + let HasFP4DstByteSel = 1; } def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile, VOP3_OPSEL> { @@ -1026,7 +1026,7 @@ def VOP3_CVT_SCALE_SR_PK_F4_F32_TiedInput_Profile : VOP3_Profile : VOP3_Profile, diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 0e19696a32f86..c38ec3ba89727 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -110,6 +110,7 @@ class VOP3_Pseudo pattern = [], let IsSWMMAC = P.IsSWMMAC; bit HasFP8DstByteSel = P.HasFP8DstByteSel; + bit HasFP4DstByteSel = P.HasFP4DstByteSel; let AsmOperands = !if(!and(!not(P.IsTrue16), isVop3OpSel), P.AsmVOP3OpSel, diff --git a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir index 1bbad901d16b2..49576433ab54d 100644 --- a/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir +++ b/llvm/test/CodeGen/AMDGPU/hazards-gfx950.mir @@ -642,17 +642,18 @@ body: | ... --- -name: test_scalef32_sr_pk_fp4_bf16_neg_opsel0_hazard +name: test_scalef32_sr_pk_fp4_bf16_opsel0_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 - ; GCN-LABEL: name: test_scalef32_sr_pk_fp4_bf16_neg_opsel0_hazard + ; GCN-LABEL: name: test_scalef32_sr_pk_fp4_bf16_opsel0_hazard ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_WAITCNT 0 ; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec ; GCN-NEXT: S_WAITCNT 3952 ; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_BF16_e64 0, killed $vgpr2, 0, killed $vgpr3, 0, killed $vgpr4, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 @@ -731,17 +732,18 @@ body: | ... --- -name: test_scalef32_sr_pk_fp4_f32_neg_opsel0_hazard +name: test_scalef32_sr_pk_fp4_f32_opsel0_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 - ; GCN-LABEL: name: test_scalef32_sr_pk_fp4_f32_neg_opsel0_hazard + ; GCN-LABEL: name: test_scalef32_sr_pk_fp4_f32_opsel0_hazard ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_WAITCNT 0 ; GCN-NEXT: renamable $vgpr0 = GLOBAL_LOAD_DWORD killed renamable $vgpr0_vgpr1, 0, 0, implicit $exec ; GCN-NEXT: S_WAITCNT 3952 ; GCN-NEXT: early-clobber renamable $vgpr1 = V_CVT_SCALEF32_SR_PK_FP4_F32_e64 0, killed $vgpr2_vgpr3, 0, killed $vgpr4, 0, killed $vgpr5, killed $vgpr0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: renamable $vgpr0 = V_ADD_U32_e32 killed $vgpr1, $vgpr1, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0 @@ -1119,17 +1121,18 @@ body: | ... --- -name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_neg_opsel0_hazard +name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_hazard body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; GCN-LABEL: name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_neg_opsel0_hazard + ; GCN-LABEL: name: test_cvt_scale_cvt_scalef32_sr_pk_fp4_f16_opsel0_hazard ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 ; GCN-NEXT: {{ $}} ; GCN-NEXT: S_WAITCNT 0 ; GCN-NEXT: renamable $vgpr2 = V_CVT_SCALEF32_PK_FP4_F16_e64 8, $vgpr0, 0, $vgpr1, 4, killed $vgpr2, 0, implicit $mode, implicit $exec ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: early-clobber renamable $vgpr4 = V_CVT_SCALEF32_SR_PK_FP4_F16_e64 0, killed $vgpr0, 0, killed $vgpr3, 0, killed $vgpr1, killed $vgpr2, 0, implicit $mode, implicit $exec + ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: $vgpr0 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec ; GCN-NEXT: S_SETPC_B64_return undef $sgpr30_sgpr31, implicit killed $vgpr0 S_WAITCNT 0