From 25071dc25f2c464b8b5395ff8044846f188932bc Mon Sep 17 00:00:00 2001 From: guochen2 Date: Thu, 31 Oct 2024 13:35:20 -0400 Subject: [PATCH 1/2] [AMDGPU][True16][MC] VOP2 update instructions with fake16 format --- llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 2 +- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 45 +++---- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 15 +++ llvm/lib/Target/AMDGPU/SIInstructions.td | 6 +- .../Target/AMDGPU/SIShrinkInstructions.cpp | 4 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 4 +- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 124 +++++++++++------- .../CodeGen/AMDGPU/fix-sgpr-copies-f16.mir | 2 +- .../test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir | 8 +- llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir | 8 +- 10 files changed, 130 insertions(+), 88 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index f0c7837e0bb75..0b8be0d88170c 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -176,7 +176,7 @@ static unsigned macToMad(unsigned Opc) { return AMDGPU::V_FMA_F32_e64; case AMDGPU::V_FMAC_F16_e64: return AMDGPU::V_FMA_F16_gfx9_e64; - case AMDGPU::V_FMAC_F16_t16_e64: + case AMDGPU::V_FMAC_F16_fake16_e64: return AMDGPU::V_FMA_F16_gfx9_e64; case AMDGPU::V_FMAC_LEGACY_F32_e64: return AMDGPU::V_FMA_LEGACY_F32_e64; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 89a2eb4f18946..d19a7efb8dc23 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3480,7 +3480,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F16_t16_e64) { + Opc == AMDGPU::V_FMAC_F16_fake16_e64) { // Don't fold if we are using source or output modifiers. The new VOP2 // instructions don't have them. if (hasAnyModifiersSet(UseMI)) @@ -3500,7 +3500,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, bool IsFMA = Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F16_t16_e64; + Opc == AMDGPU::V_FMAC_F16_fake16_e64; MachineOperand *Src1 = getNamedOperand(UseMI, AMDGPU::OpName::src1); MachineOperand *Src2 = getNamedOperand(UseMI, AMDGPU::OpName::src2); @@ -3533,16 +3533,16 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned NewOpc = IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32 - : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16 + : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16 : AMDGPU::V_FMAMK_F16) : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16); if (pseudoToMCOpcode(NewOpc) == -1) return false; - // V_FMAMK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite + // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite // would also require restricting their register classes. For now // just bail out. - if (NewOpc == AMDGPU::V_FMAMK_F16_t16) + if (NewOpc == AMDGPU::V_FMAMK_F16_fake16) return false; const int64_t Imm = getImmFor(RegSrc == Src1 ? *Src0 : *Src1); @@ -3557,8 +3557,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Src0->setIsKill(RegSrc->isKill()); if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || - Opc == AMDGPU::V_FMAC_F16_e64) + Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); @@ -3612,24 +3612,24 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned NewOpc = IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32 - : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16 + : ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16 : AMDGPU::V_FMAAK_F16) : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16); if (pseudoToMCOpcode(NewOpc) == -1) return false; - // V_FMAAK_F16_t16 takes VGPR_32_Lo128 operands, so the rewrite + // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite // would also require restricting their register classes. For now // just bail out. - if (NewOpc == AMDGPU::V_FMAAK_F16_t16) + if (NewOpc == AMDGPU::V_FMAAK_F16_fake16) return false; // FIXME: This would be a lot easier if we could return a new instruction // instead of having to modify in place. if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 || - Opc == AMDGPU::V_FMAC_F16_e64) + Opc == AMDGPU::V_FMAC_F32_e64 || + Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64) UseMI.untieRegOperand( AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src2)); @@ -3852,19 +3852,20 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, return MIB; } - assert(Opc != AMDGPU::V_FMAC_F16_t16_e32 && - "V_FMAC_F16_t16_e32 is not supported and not expected to be present " - "pre-RA"); + assert( + Opc != AMDGPU::V_FMAC_F16_fake16_e32 && + "V_FMAC_F16_fake16_e32 is not supported and not expected to be present " + "pre-RA"); // Handle MAC/FMAC. bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F16_t16_e64; + Opc == AMDGPU::V_FMAC_F16_fake16_e64; bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 || Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 || Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 || - Opc == AMDGPU::V_FMAC_F16_t16_e64 || + Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64; bool IsLegacy = Opc == AMDGPU::V_MAC_LEGACY_F32_e32 || @@ -3878,7 +3879,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, return nullptr; case AMDGPU::V_MAC_F16_e64: case AMDGPU::V_FMAC_F16_e64: - case AMDGPU::V_FMAC_F16_t16_e64: + case AMDGPU::V_FMAC_F16_fake16_e64: case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_LEGACY_F32_e64: case AMDGPU::V_FMAC_F32_e64: @@ -3963,7 +3964,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, int64_t Imm; if (!Src0Literal && getFoldableImm(Src2, Imm, &DefMI)) { unsigned NewOpc = - IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16 + IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16 : AMDGPU::V_FMAAK_F16) : AMDGPU::V_FMAAK_F32) : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32); @@ -3982,7 +3983,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI, } } unsigned NewOpc = - IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16 + IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16 : AMDGPU::V_FMAMK_F16) : AMDGPU::V_FMAMK_F32) : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32); @@ -4437,7 +4438,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI, case AMDGPU::V_MAC_F32_e64: case AMDGPU::V_MAC_LEGACY_F32_e64: case AMDGPU::V_FMAC_F16_e64: - case AMDGPU::V_FMAC_F16_t16_e64: + case AMDGPU::V_FMAC_F16_fake16_e64: case AMDGPU::V_FMAC_F32_e64: case AMDGPU::V_FMAC_F64_e64: case AMDGPU::V_FMAC_LEGACY_F32_e64: @@ -5484,7 +5485,7 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const { case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64; case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64; case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64; - case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_t16_e64; + case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64; case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32; case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32; case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index fce50b741bb63..9f8e6a082d965 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1755,6 +1755,21 @@ class getVOP3SrcForVT { 1 : VSrc_b32); } +// Returns the vreg register class to use for sources of VOP3 instructions for the +// given VT. +class getVOP3VRegSrcForVT { + RegisterOperand ret = + !cond(!eq(VT.Size, 128) : RegisterOperand, + !eq(VT.Size, 96) : RegisterOperand, + !eq(VT.Size, 64) : RegisterOperand, + !eq(VT.Size, 48) : RegisterOperand, + !eq(VT.Size, 16) : !if(IsTrue16, + !if(IsFake16, RegisterOperand, + RegisterOperand), + RegisterOperand), + 1 : RegisterOperand); +} + // Src2 of VOP3 DPP instructions cannot be a literal class getVOP3DPPSrcForVT { RegisterOperand ret = diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index c8a46217190a1..c4977f1fb2aec 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3200,7 +3200,7 @@ def : GCNPat < let SubtargetPredicate = isGFX10Plus in { // Don't allow source modifiers. If there are any source modifiers then it's // better to select fma instead of fmac. -let OtherPredicates = [NotHasTrue16BitInsts] in +let True16Predicate = NotHasTrue16BitInsts in def : GCNPat < (fma (f16 (VOP3NoMods f32:$src0)), (f16 (VOP3NoMods f32:$src1)), @@ -3208,12 +3208,12 @@ def : GCNPat < (V_FMAC_F16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2) >; -let OtherPredicates = [HasTrue16BitInsts] in +let True16Predicate = UseFakeTrue16Insts in def : GCNPat < (fma (f16 (VOP3NoMods f32:$src0)), (f16 (VOP3NoMods f32:$src1)), (f16 (VOP3NoMods f32:$src2))), - (V_FMAC_F16_t16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, + (V_FMAC_F16_fake16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2) >; } diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index f0b0e378ad668..42df4576a774d 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -455,7 +455,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { break; case AMDGPU::V_FMA_F16_e64: case AMDGPU::V_FMA_F16_gfx9_e64: - NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_t16 + NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAAK_F16_fake16 : AMDGPU::V_FMAAK_F16; break; } @@ -484,7 +484,7 @@ void SIShrinkInstructions::shrinkMadFma(MachineInstr &MI) const { break; case AMDGPU::V_FMA_F16_e64: case AMDGPU::V_FMA_F16_gfx9_e64: - NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_t16 + NewOpcode = ST->hasTrue16BitInsts() ? AMDGPU::V_FMAMK_F16_fake16 : AMDGPU::V_FMAMK_F16; break; } diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 20a81a3135f0b..e3d7786cbe6b9 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -563,8 +563,8 @@ bool isMAC(unsigned Opc) { Opc == AMDGPU::V_FMAC_LEGACY_F32_e64_gfx10 || Opc == AMDGPU::V_FMAC_DX9_ZERO_F32_e64_gfx11 || Opc == AMDGPU::V_FMAC_F16_e64_gfx10 || - Opc == AMDGPU::V_FMAC_F16_t16_e64_gfx11 || - Opc == AMDGPU::V_FMAC_F16_t16_e64_gfx12 || + Opc == AMDGPU::V_FMAC_F16_fake16_e64_gfx11 || + Opc == AMDGPU::V_FMAC_F16_fake16_e64_gfx12 || Opc == AMDGPU::V_DOT2C_F32_F16_e64_vi || Opc == AMDGPU::V_DOT2C_I32_I16_e64_vi || Opc == AMDGPU::V_DOT4C_I32_I8_e64_vi || diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index fbde3bb7d1411..925b60561c9d6 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -95,6 +95,7 @@ class VOP2_Real : VOP_MADK_Base { } def VOP_MADAK_F16 : VOP_MADAK ; -def VOP_MADAK_F16_t16 : VOP_MADAK { +def VOP_MADAK_F16_fake16 : VOP_MADAK { let IsTrue16 = 1; - let DstRC = VOPDstOperand; - let Ins32 = (ins VSrcFake16_f16_Lo128_Deferred:$src0, VGPR_32_Lo128:$src1, ImmOpType:$imm); + let DstRC = getVALUDstForVT_fake16.ret; + let Ins32 = (ins VSrcFake16_f16_Lo128_Deferred:$src0, VGPRSrc_32_Lo128:$src1, ImmOpType:$imm); } def VOP_MADAK_F32 : VOP_MADAK ; @@ -398,10 +399,10 @@ class VOP_MADMK : VOP_MADK_Base { } def VOP_MADMK_F16 : VOP_MADMK ; -def VOP_MADMK_F16_t16 : VOP_MADMK { +def VOP_MADMK_F16_fake16 : VOP_MADMK { let IsTrue16 = 1; - let DstRC = VOPDstOperand; - let Ins32 = (ins VSrcFake16_f16_Lo128_Deferred:$src0, ImmOpType:$imm, VGPR_32_Lo128:$src1); + let DstRC = getVALUDstForVT_fake16.ret; + let Ins32 = (ins VSrcFake16_f16_Lo128_Deferred:$src0, ImmOpType:$imm, VGPRSrc_32_Lo128:$src1); } def VOP_MADMK_F32 : VOP_MADMK ; @@ -409,7 +410,9 @@ def VOP_MADMK_F32 : VOP_MADMK ; // and processing time but it makes it easier to convert to mad. class VOP_MAC : VOPProfile <[vt0, vt1, vt1, vt0]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT.ret:$src2); - let Ins64 = getIns64.ret, 3, + // Src2 must accept the same operand types as vdst, namely VGPRs only + let Src2RC64 = getVOP3VRegSrcForVT.ret; + let Ins64 = getIns64.ret; let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, @@ -464,21 +467,18 @@ class VOP_MAC : VOPProfile <[vt0, vt1, vt1, v } def VOP_MAC_F16 : VOP_MAC ; -def VOP_MAC_F16_t16 : VOP_MAC { +def VOP_MAC_F16_fake16 : VOP_MAC { let IsTrue16 = 1; - let HasOpSel = 1; - let AsmVOP3OpSel = getAsmVOP3OpSel<2/*NumSrcArgs*/, HasClamp, HasOMod, - HasSrc0FloatMods, HasSrc1FloatMods, HasSrc2FloatMods>.ret; - let DstRC = VOPDstOperand; - let DstRC64 = VOPDstOperand; - let Src1RC32 = VGPRSrc_32_Lo128; + let DstRC = getVALUDstForVT_fake16.ret; + let Src0RC32 = getVOPSrc0ForVT.ret; + let Src1RC32 = getVregSrcForVT.ret; let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT.ret:$src2); let Src0DPP = getVregSrcForVT.ret; let Src1DPP = getVregSrcForVT.ret; let Src2DPP = getVregSrcForVT.ret; - let Src0ModDPP = getSrcModDPP_t16.ret; - let Src1ModDPP = getSrcModDPP_t16.ret; - let Src2ModDPP = getSrcModDPP_t16.ret; + let Src0ModDPP = getSrcModDPP_t16.ret; + let Src1ModDPP = getSrcModDPP_t16.ret; + let Src2ModDPP = getSrcModDPP_t16.ret; let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, getVregSrcForVT.ret:$src2, // stub argument @@ -488,10 +488,18 @@ def VOP_MAC_F16_t16 : VOP_MAC { Src1ModDPP:$src1_modifiers, Src1DPP:$src1, getVregSrcForVT.ret:$src2, // stub argument dpp8:$dpp8, Dpp8FI:$fi); - let Src2Mod = FP32InputMods; // dummy unused modifiers - let Src2RC64 = VGPRSrc_32; // stub argument + let DstRC64 = getVALUDstForVT.ret; + let Src0VOP3DPP = VGPRSrc_32; + let Src1VOP3DPP = getVOP3DPPSrcForVT.ret; + let Src2VOP3DPP = getVOP3DPPSrcForVT.ret; + let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src2ModVOP3DPP = getSrcModVOP3DPP.ret; + let Src0Mod = getSrc0Mod.ret; + let Src1Mod = getSrcMod.ret; + let Src2Mod = getSrcMod.ret; } + def VOP_MAC_F32 : VOP_MAC ; let HasExtDPP = 0, HasExt32BitDPP = 0 in def VOP_MAC_LEGACY_F32 : VOP_MAC ; @@ -650,15 +658,18 @@ class VOP2e_SGPR ArgVT> : VOPProfile { } def VOP2e_I32_I32_I32_I1 : VOP2e_SGPR<[i32, i32, i32, i1]>; +def VOP2e_I16_I16_I16_I1 : VOP2e_SGPR<[i16, i16, i16, i1]>; +// V_CNDMASK_B16 is VOP3 only def VOP2e_I16_I16_I16_I1_fake16 : VOP2e_SGPR<[i16, i16, i16, i1]> { let IsTrue16 = 1; let DstRC64 = getVALUDstForVT.ret; - let Src0Mod = getSrcMod.ret; - let Src1Mod = getSrcMod.ret; + let Src0Mod = getSrc0Mod.ret; + let Src1Mod = getSrcMod.ret; let Src0VOP3DPP = VGPRSrc_32; - let Src1VOP3DPP = getVOP3DPPSrcForVT.ret; + let Src1VOP3DPP = getVOP3DPPSrcForVT.ret; + let Src0ModVOP3DPP = getSrc0ModVOP3DPP.ret; let Src1ModVOP3DPP = getSrcModVOP3DPP.ret; } @@ -924,7 +935,6 @@ let FPDPRounding = 1 in { let SubtargetPredicate = UseFakeTrue16Insts in defm V_LDEXP_F16_fake16 : VOP2Inst <"v_ldexp_f16_fake16", LDEXP_F16_VOPProfile_Fake16, null_frag, "v_ldexp_f16_fake16">; } // End FPDPRounding = 1 -// FIXME VOP3 Only instructions. NFC using VOPProfile_True16 for these until a planned change to use a new register class for VOP3 encoded True16 instuctions defm V_LSHLREV_B16 : VOP2Inst_e64_t16 <"v_lshlrev_b16", VOP_I16_I16_I16, clshl_rev_16>; defm V_LSHRREV_B16 : VOP2Inst_e64_t16 <"v_lshrrev_b16", VOP_I16_I16_I16, clshr_rev_16>; defm V_ASHRREV_I16 : VOP2Inst_e64_t16 <"v_ashrrev_i16", VOP_I16_I16_I16, cashr_rev_16>; @@ -986,18 +996,18 @@ let SubtargetPredicate = isGFX11Plus in { let FPDPRounding = 1, isReMaterializable = 1, FixedSize = 1 in { let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in { -def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">; + def V_FMAMK_F16 : VOP2_Pseudo <"v_fmamk_f16", VOP_MADMK_F16, [], "">; } -let SubtargetPredicate = HasTrue16BitInsts in { -def V_FMAMK_F16_t16 : VOP2_Pseudo <"v_fmamk_f16_t16", VOP_MADMK_F16_t16, [], "">; +let True16Predicate = UseFakeTrue16Insts in { + def V_FMAMK_F16_fake16 : VOP2_Pseudo <"v_fmamk_f16_fake16", VOP_MADMK_F16_fake16, [], "">; } let isCommutable = 1 in { let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in { -def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">; + def V_FMAAK_F16 : VOP2_Pseudo <"v_fmaak_f16", VOP_MADAK_F16, [], "">; } -let SubtargetPredicate = HasTrue16BitInsts in { -def V_FMAAK_F16_t16 : VOP2_Pseudo <"v_fmaak_f16_t16", VOP_MADAK_F16_t16, [], "">; +let True16Predicate = UseFakeTrue16Insts in { + def V_FMAAK_F16_fake16 : VOP2_Pseudo <"v_fmaak_f16_fake16", VOP_MADAK_F16_fake16, [], "">; } } // End isCommutable = 1 } // End FPDPRounding = 1, isReMaterializable = 1, FixedSize = 1 @@ -1006,22 +1016,24 @@ let Constraints = "$vdst = $src2", DisableEncoding="$src2", isConvertibleToThreeAddress = 1, isCommutable = 1 in { -let SubtargetPredicate = isGFX10Plus, True16Predicate = NotHasTrue16BitInsts in { -defm V_FMAC_F16 : VOP2Inst <"v_fmac_f16", VOP_MAC_F16>; +let SubtargetPredicate = isGFX10Plus in { +let True16Predicate = NotHasTrue16BitInsts in { + defm V_FMAC_F16 : VOP2Inst <"v_fmac_f16", VOP_MAC_F16>; } -let SubtargetPredicate = HasTrue16BitInsts in { -defm V_FMAC_F16_t16 : VOP2Inst <"v_fmac_f16_t16", VOP_MAC_F16_t16>; +let True16Predicate = UseFakeTrue16Insts in { + defm V_FMAC_F16_fake16 : VOP2Inst <"v_fmac_f16_fake16", VOP_MAC_F16_fake16>; } +} // End SubtargetPredicate = isGFX10Plus } // End FMAC Constraints let SubtargetPredicate = Has16BitInsts in { let isReMaterializable = 1 in { let FPDPRounding = 1 in { -def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">; + def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">; } // End FPDPRounding = 1 let isCommutable = 1 in { let mayRaiseFPException = 0 in { -def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">; + def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">; } let SubtargetPredicate = isGFX8GFX9 in { defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>; @@ -1576,14 +1588,20 @@ multiclass VOP2_Real_FULL_with_name_gfx12 op, string opName, string asmName> : VOP2_Real_FULL_with_name; -multiclass VOP2_Real_FULL_t16_with_name_gfx12 op, string opName, - string asmName, string alias> { +multiclass VOP2_Real_FULL_t16_gfx12 op, string opName, + string asmName, string alias> { defm NAME : VOP2_Real_FULL_with_name; def _gfx12_2nd_alias : AMDGPUMnemonicAlias { let AssemblerPredicate = isGFX12Only; } } +multiclass VOP2_Real_FULL_t16_and_fake16_gfx12 op, string opName, + string asmName, string alias> { + defm _t16: VOP2_Real_FULL_t16_gfx12; + defm _fake16: VOP2_Real_FULL_t16_gfx12; +} + multiclass VOP2_Real_NO_DPP_with_name_gfx12 op, string opName, string asmName> : VOP2_Real_NO_DPP_with_name; @@ -1607,10 +1625,8 @@ defm V_SUBREV_CO_CI_U32 : defm V_MIN_NUM_F32 : VOP2_Real_FULL_with_name_gfx12<0x015, "V_MIN_F32", "v_min_num_f32">; defm V_MAX_NUM_F32 : VOP2_Real_FULL_with_name_gfx12<0x016, "V_MAX_F32", "v_max_num_f32">; -defm V_MIN_NUM_F16 : VOP2_Real_FULL_t16_with_name_gfx12<0x030, "V_MIN_F16_t16", "v_min_num_f16", "v_min_f16">; -defm V_MIN_NUM_F16_fake16 : VOP2_Real_FULL_t16_with_name_gfx12<0x030, "V_MIN_F16_fake16", "v_min_num_f16", "v_min_f16">; -defm V_MAX_NUM_F16 : VOP2_Real_FULL_t16_with_name_gfx12<0x031, "V_MAX_F16_t16", "v_max_num_f16", "v_max_f16">; -defm V_MAX_NUM_F16_fake16 : VOP2_Real_FULL_t16_with_name_gfx12<0x031, "V_MAX_F16_fake16", "v_max_num_f16", "v_max_f16">; +defm V_MIN_NUM_F16 : VOP2_Real_FULL_t16_and_fake16_gfx12<0x030, "V_MIN_F16", "v_min_num_f16", "v_min_f16">; +defm V_MAX_NUM_F16 : VOP2_Real_FULL_t16_and_fake16_gfx12<0x031, "V_MAX_F16", "v_max_num_f16", "v_max_f16">; let SubtargetPredicate = isGFX12Plus in { defm : VOP2eInstAliases; @@ -1645,6 +1661,14 @@ multiclass VOP2_Real_NO_VOP3_with_name_gfx11 op, string opName, } } +multiclass VOP2_Real_FULL_t16_gfx11 op, string asmName, string opName = NAME> : + VOP2_Real_FULL_with_name; + +multiclass VOP2_Real_FULL_t16_and_fake16_gfx11 op, string asmName, string opName = NAME> { + defm opName#"_t16": VOP2_Real_FULL_t16_gfx11; + defm opName#"_fake16": VOP2_Real_FULL_t16_gfx11; +} + multiclass VOP2_Real_NO_DPP_with_name_gfx11 op, string opName, string asmName> : VOP2_Real_NO_DPP_with_name; @@ -1675,14 +1699,16 @@ multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12 op, string as multiclass VOP3beOnly_Realtriple_gfx11_gfx12 op> : VOP3beOnly_Realtriple, VOP3beOnly_Realtriple; -multiclass VOP2Only_Real_MADK_with_name_gfx11_gfx12 op, string asmName, - string opName = NAME> : +multiclass VOP2Only_Real_MADK_t16_gfx11_gfx12 op, string asmName, + string opName = NAME> : VOP2Only_Real_MADK_with_name, VOP2Only_Real_MADK_with_name; -multiclass VOP2_Real_FULL_t16_gfx11 op, string asmName, - string opName = NAME> : - VOP2_Real_FULL_with_name; +multiclass VOP2Only_Real_MADK_t16_and_fake16_gfx11_gfx12 op, string asmName, + string opName = NAME> { + defm _t16: VOP2Only_Real_MADK_t16_gfx11_gfx12; + defm _fake16: VOP2Only_Real_MADK_t16_gfx11_gfx12; +} multiclass VOP2_Real_FULL_t16_gfx11_gfx12 op, string asmName, string opName = NAME> : @@ -1721,15 +1747,15 @@ defm V_SUBREV_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x034, "v_subrev_f16 defm V_SUBREV_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x034, "v_subrev_f16">; defm V_MUL_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">; defm V_MUL_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x035, "v_mul_f16">; -defm V_FMAC_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x036, "v_fmac_f16">; +defm V_FMAC_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x036, "v_fmac_f16">; defm V_LDEXP_F16_t16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x03b, "v_ldexp_f16">; defm V_LDEXP_F16_fake16 : VOP2_Real_FULL_t16_gfx11_gfx12<0x03b, "v_ldexp_f16">; defm V_MAX_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">; defm V_MAX_F16_fake16 : VOP2_Real_FULL_t16_gfx11<0x039, "v_max_f16">; defm V_MIN_F16_t16 : VOP2_Real_FULL_t16_gfx11<0x03a, "v_min_f16">; defm V_MIN_F16_fake16 : VOP2_Real_FULL_t16_gfx11<0x03a, "v_min_f16">; -defm V_FMAMK_F16_t16 : VOP2Only_Real_MADK_with_name_gfx11_gfx12<0x037, "v_fmamk_f16">; -defm V_FMAAK_F16_t16 : VOP2Only_Real_MADK_with_name_gfx11_gfx12<0x038, "v_fmaak_f16">; +defm V_FMAMK_F16_fake16 : VOP2Only_Real_MADK_t16_gfx11_gfx12<0x037, "v_fmamk_f16">; +defm V_FMAAK_F16_fake16 : VOP2Only_Real_MADK_t16_gfx11_gfx12<0x038, "v_fmaak_f16">; // VOP3 only. defm V_CNDMASK_B16 : VOP3Only_Realtriple_gfx11_gfx12<0x25d>; diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir index e8291f7ab8f72..ac7944f25fe37 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir @@ -12,7 +12,7 @@ body: | ; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF ; GCN-NEXT: [[V_CVT_F32_U32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec ; GCN-NEXT: [[DEF3:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_FMAC_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAC_F16_t16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GCN-NEXT: [[V_FMAC_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FMAC_F16_fake16_e64 0, killed [[DEF1]], 0, [[DEF2]], 0, [[V_CVT_F32_U32_e64_]], 0, 0, implicit $mode, implicit $exec %0:vgpr_32 = IMPLICIT_DEF %1:sreg_32 = IMPLICIT_DEF %2:sreg_32 = IMPLICIT_DEF diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir index 820b8579bd0a4..cefd24032871f 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/gfx11-twoaddr-fma.mir @@ -23,7 +23,7 @@ body: | %1 = COPY %0.sub1 %2 = COPY %0.sub0 %3 = V_MOV_B32_e32 1078523331, implicit $exec - %4 = V_FMAC_F16_t16_e64 0, killed %2, 0, %3, 0, killed %1, 0, 0, 0, implicit $mode, implicit $exec + %4 = V_FMAC_F16_fake16_e64 0, killed %2, 0, %3, 0, killed %1, 0, 0, implicit $mode, implicit $exec ... @@ -48,7 +48,7 @@ body: | %1 = COPY %0.sub1 %2 = COPY %0.sub0 %3 = V_MOV_B32_e32 1078523331, implicit $exec - %4 = V_FMAC_F16_t16_e64 0, %2, 0, killed %3, 0, killed %1, 0, 0, 0, implicit $mode, implicit $exec + %4 = V_FMAC_F16_fake16_e64 0, %2, 0, killed %3, 0, killed %1, 0, 0, implicit $mode, implicit $exec ... @@ -73,7 +73,7 @@ body: | %1 = COPY %0.sub0 %2 = COPY %0.sub1 %3 = V_MOV_B32_e32 1078523331, implicit $exec - %4 = V_FMAC_F16_t16_e64 0, killed %1, 0, %2, 0, %3, 0, 0, 0, implicit $mode, implicit $exec + %4 = V_FMAC_F16_fake16_e64 0, killed %1, 0, %2, 0, %3, 0, 0, implicit $mode, implicit $exec ... --- @@ -95,7 +95,7 @@ body: | %0:vgpr_32 = COPY killed $vgpr0 %1:vgpr_32 = V_MOV_B32_e32 49664, implicit $exec - %2:vgpr_32 = V_FMAC_F16_t16_e64 0, 16384, 0, killed %0, 0, %1, 0, 0, 0, implicit $mode, implicit $exec + %2:vgpr_32 = V_FMAC_F16_fake16_e64 0, 16384, 0, killed %0, 0, %1, 0, 0, implicit $mode, implicit $exec S_ENDPGM 0 ... diff --git a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir index ed2148ab5a198..26feb8120c751 100644 --- a/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir +++ b/llvm/test/CodeGen/AMDGPU/shrink-mad-fma.mir @@ -267,7 +267,7 @@ body: | ; GFX11-LABEL: name: fma_cvv_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_t16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec ; GFX11-NEXT: SI_RETURN implicit $vgpr2 $vgpr0 = IMPLICIT_DEF $vgpr1 = IMPLICIT_DEF @@ -288,7 +288,7 @@ body: | ; GFX11-LABEL: name: fma_vcv_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_t16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr2 = V_FMAMK_F16_fake16 $vgpr0, 18688, $vgpr1, implicit $mode, implicit $exec ; GFX11-NEXT: SI_RETURN implicit $vgpr2 $vgpr0 = IMPLICIT_DEF $vgpr1 = IMPLICIT_DEF @@ -309,7 +309,7 @@ body: | ; GFX11-LABEL: name: fma_vvc_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $vgpr1 = IMPLICIT_DEF - ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_t16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec ; GFX11-NEXT: SI_RETURN implicit $vgpr2 $vgpr0 = IMPLICIT_DEF $vgpr1 = IMPLICIT_DEF @@ -330,7 +330,7 @@ body: | ; GFX11-LABEL: name: fma_vsc_f16 ; GFX11: $vgpr0 = IMPLICIT_DEF ; GFX11-NEXT: $sgpr1 = IMPLICIT_DEF - ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_t16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec + ; GFX11-NEXT: $vgpr2 = V_FMAAK_F16_fake16 $vgpr0, $vgpr1, 18688, implicit $mode, implicit $exec ; GFX11-NEXT: SI_RETURN implicit $vgpr2 $vgpr0 = IMPLICIT_DEF $sgpr1 = IMPLICIT_DEF From 95d23b91b3460d07ebfbfe8c1b7ad89f36cfb417 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Mon, 4 Nov 2024 17:52:24 -0500 Subject: [PATCH 2/2] Update SIInstructions.td --- llvm/lib/Target/AMDGPU/SIInstructions.td | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td index c4977f1fb2aec..52df38c352cf5 100644 --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -3210,9 +3210,9 @@ def : GCNPat < >; let True16Predicate = UseFakeTrue16Insts in def : GCNPat < - (fma (f16 (VOP3NoMods f32:$src0)), - (f16 (VOP3NoMods f32:$src1)), - (f16 (VOP3NoMods f32:$src2))), + (fma (f16 (VOP3NoMods f16:$src0)), + (f16 (VOP3NoMods f16:$src1)), + (f16 (VOP3NoMods f16:$src2))), (V_FMAC_F16_fake16_e64 SRCMODS.NONE, $src0, SRCMODS.NONE, $src1, SRCMODS.NONE, $src2) >;