@@ -3461,6 +3461,62 @@ std::optional<int64_t> SIInstrInfo::extractSubregFromImm(int64_t Imm,
34613461 llvm_unreachable (" covered subregister switch" );
34623462}
34633463
3464+ static unsigned getNewFMAAKInst (const GCNSubtarget &ST, unsigned Opc) {
3465+ switch (Opc) {
3466+ case AMDGPU::V_MAC_F16_e32:
3467+ case AMDGPU::V_MAC_F16_e64:
3468+ case AMDGPU::V_MAD_F16_e64:
3469+ return AMDGPU::V_MADAK_F16;
3470+ case AMDGPU::V_MAC_F32_e32:
3471+ case AMDGPU::V_MAC_F32_e64:
3472+ case AMDGPU::V_MAD_F32_e64:
3473+ return AMDGPU::V_MADAK_F32;
3474+ case AMDGPU::V_FMAC_F32_e32:
3475+ case AMDGPU::V_FMAC_F32_e64:
3476+ case AMDGPU::V_FMA_F32_e64:
3477+ return AMDGPU::V_FMAAK_F32;
3478+ case AMDGPU::V_FMAC_F16_e32:
3479+ case AMDGPU::V_FMAC_F16_e64:
3480+ case AMDGPU::V_FMAC_F16_t16_e64:
3481+ case AMDGPU::V_FMAC_F16_fake16_e64:
3482+ case AMDGPU::V_FMA_F16_e64:
3483+ return ST.hasTrue16BitInsts () ? ST.useRealTrue16Insts ()
3484+ ? AMDGPU::V_FMAAK_F16_t16
3485+ : AMDGPU::V_FMAAK_F16_fake16
3486+ : AMDGPU::V_FMAAK_F16;
3487+ default :
3488+ llvm_unreachable (" invalid instruction" );
3489+ }
3490+ }
3491+
3492+ static unsigned getNewFMAMKInst (const GCNSubtarget &ST, unsigned Opc) {
3493+ switch (Opc) {
3494+ case AMDGPU::V_MAC_F16_e32:
3495+ case AMDGPU::V_MAC_F16_e64:
3496+ case AMDGPU::V_MAD_F16_e64:
3497+ return AMDGPU::V_MADMK_F16;
3498+ case AMDGPU::V_MAC_F32_e32:
3499+ case AMDGPU::V_MAC_F32_e64:
3500+ case AMDGPU::V_MAD_F32_e64:
3501+ return AMDGPU::V_MADMK_F32;
3502+ case AMDGPU::V_FMAC_F32_e32:
3503+ case AMDGPU::V_FMAC_F32_e64:
3504+ case AMDGPU::V_FMA_F32_e64:
3505+ return AMDGPU::V_FMAMK_F32;
3506+ case AMDGPU::V_FMAC_F16_e32:
3507+ case AMDGPU::V_FMAC_F16_e64:
3508+ case AMDGPU::V_FMAC_F16_t16_e64:
3509+ case AMDGPU::V_FMAC_F16_fake16_e64:
3510+ case AMDGPU::V_FMA_F16_e64:
3511+ return ST.hasTrue16BitInsts () ? ST.useRealTrue16Insts ()
3512+ ? AMDGPU::V_FMAMK_F16_t16
3513+ : AMDGPU::V_FMAMK_F16_fake16
3514+ : AMDGPU::V_FMAMK_F16;
3515+ default :
3516+ llvm_unreachable (" invalid instruction" );
3517+ }
3518+ }
3519+
34643520bool SIInstrInfo::foldImmediate (MachineInstr &UseMI, MachineInstr &DefMI,
34653521 Register Reg, MachineRegisterInfo *MRI) const {
34663522 if (!MRI->hasOneNonDBGUse (Reg))
@@ -3533,6 +3589,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35333589 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
35343590 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
35353591 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3592+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
35363593 Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
35373594 // Don't fold if we are using source or output modifiers. The new VOP2
35383595 // instructions don't have them.
@@ -3555,6 +3612,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35553612 bool IsFMA =
35563613 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
35573614 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3615+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
35583616 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
35593617 MachineOperand *Src1 = getNamedOperand (UseMI, AMDGPU::OpName::src1);
35603618 MachineOperand *Src2 = getNamedOperand (UseMI, AMDGPU::OpName::src2);
@@ -3586,18 +3644,15 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35863644 !isInlineConstant (Def->getOperand (1 )))
35873645 return false ;
35883646
3589- unsigned NewOpc =
3590- IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3591- : ST.hasTrue16BitInsts () ? AMDGPU::V_FMAMK_F16_fake16
3592- : AMDGPU::V_FMAMK_F16)
3593- : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
3647+ unsigned NewOpc = getNewFMAMKInst (ST, Opc);
35943648 if (pseudoToMCOpcode (NewOpc) == -1 )
35953649 return false ;
35963650
3597- // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3598- // would also require restricting their register classes. For now
3599- // just bail out.
3600- if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3651+ // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3652+ // takes VGPR_32_Lo128 operands, so the rewrite would also require
3653+ // restricting their register classes. For now just bail out.
3654+ if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3655+ NewOpc == AMDGPU::V_FMAMK_F16_fake16)
36013656 return false ;
36023657
36033658 const std::optional<int64_t > SubRegImm = extractSubregFromImm (
@@ -3613,7 +3668,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
36133668 Src0->setIsKill (RegSrc->isKill ());
36143669
36153670 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3616- Opc == AMDGPU::V_FMAC_F32_e64 ||
3671+ Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
36173672 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
36183673 UseMI.untieRegOperand (
36193674 AMDGPU::getNamedOperandIdx (Opc, AMDGPU::OpName::src2));
@@ -3666,25 +3721,22 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
36663721 }
36673722 }
36683723
3669- unsigned NewOpc =
3670- IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3671- : ST.hasTrue16BitInsts () ? AMDGPU::V_FMAAK_F16_fake16
3672- : AMDGPU::V_FMAAK_F16)
3673- : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
3724+ unsigned NewOpc = getNewFMAAKInst (ST, Opc);
36743725 if (pseudoToMCOpcode (NewOpc) == -1 )
36753726 return false ;
36763727
3677- // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3678- // would also require restricting their register classes. For now
3679- // just bail out.
3680- if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3728+ // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3729+ // takes VGPR_32_Lo128 operands, so the rewrite would also require
3730+ // restricting their register classes. For now just bail out.
3731+ if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3732+ NewOpc == AMDGPU::V_FMAAK_F16_fake16)
36813733 return false ;
36823734
36833735 // FIXME: This would be a lot easier if we could return a new instruction
36843736 // instead of having to modify in place.
36853737
36863738 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3687- Opc == AMDGPU::V_FMAC_F32_e64 ||
3739+ Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
36883740 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
36893741 UseMI.untieRegOperand (
36903742 AMDGPU::getNamedOperandIdx (Opc, AMDGPU::OpName::src2));
@@ -3874,8 +3926,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
38743926 return AMDGPU::V_FMA_LEGACY_F32_e64;
38753927 case AMDGPU::V_FMAC_F16_e32:
38763928 case AMDGPU::V_FMAC_F16_e64:
3929+ case AMDGPU::V_FMAC_F16_t16_e64:
38773930 case AMDGPU::V_FMAC_F16_fake16_e64:
3878- return ST.hasTrue16BitInsts () ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
3931+ return ST.hasTrue16BitInsts () ? ST.useRealTrue16Insts ()
3932+ ? AMDGPU::V_FMA_F16_gfx9_t16_e64
3933+ : AMDGPU::V_FMA_F16_gfx9_fake16_e64
38793934 : AMDGPU::V_FMA_F16_gfx9_e64;
38803935 case AMDGPU::V_FMAC_F32_e32:
38813936 case AMDGPU::V_FMAC_F32_e64:
@@ -3941,19 +3996,21 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
39413996 return MIB;
39423997 }
39433998
3944- assert (
3945- Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3946- " V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
3947- " pre-RA" );
3999+ assert (Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
4000+ Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
4001+ " V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
4002+ " present pre-RA" );
39484003
39494004 // Handle MAC/FMAC.
39504005 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
39514006 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
4007+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
39524008 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
39534009 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
39544010 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
39554011 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
39564012 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
4013+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
39574014 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
39584015 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
39594016 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
@@ -3968,6 +4025,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
39684025 return nullptr ;
39694026 case AMDGPU::V_MAC_F16_e64:
39704027 case AMDGPU::V_FMAC_F16_e64:
4028+ case AMDGPU::V_FMAC_F16_t16_e64:
39714029 case AMDGPU::V_FMAC_F16_fake16_e64:
39724030 case AMDGPU::V_MAC_F32_e64:
39734031 case AMDGPU::V_MAC_LEGACY_F32_e64:
@@ -4052,11 +4110,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
40524110
40534111 int64_t Imm;
40544112 if (!Src0Literal && getFoldableImm (Src2, Imm, &DefMI)) {
4055- unsigned NewOpc =
4056- IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts () ? AMDGPU::V_FMAAK_F16_fake16
4057- : AMDGPU::V_FMAAK_F16)
4058- : AMDGPU::V_FMAAK_F32)
4059- : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
4113+ unsigned NewOpc = getNewFMAAKInst (ST, Opc);
40604114 if (pseudoToMCOpcode (NewOpc) != -1 ) {
40614115 MIB = BuildMI (MBB, MI, MI.getDebugLoc (), get (NewOpc))
40624116 .add (*Dst)
@@ -4071,11 +4125,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
40714125 return MIB;
40724126 }
40734127 }
4074- unsigned NewOpc =
4075- IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts () ? AMDGPU::V_FMAMK_F16_fake16
4076- : AMDGPU::V_FMAMK_F16)
4077- : AMDGPU::V_FMAMK_F32)
4078- : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
4128+ unsigned NewOpc = getNewFMAMKInst (ST, Opc);
40794129 if (!Src0Literal && getFoldableImm (Src1, Imm, &DefMI)) {
40804130 if (pseudoToMCOpcode (NewOpc) != -1 ) {
40814131 MIB = BuildMI (MBB, MI, MI.getDebugLoc (), get (NewOpc))
@@ -4513,6 +4563,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
45134563 case AMDGPU::V_MAC_F32_e64:
45144564 case AMDGPU::V_MAC_LEGACY_F32_e64:
45154565 case AMDGPU::V_FMAC_F16_e64:
4566+ case AMDGPU::V_FMAC_F16_t16_e64:
45164567 case AMDGPU::V_FMAC_F16_fake16_e64:
45174568 case AMDGPU::V_FMAC_F32_e64:
45184569 case AMDGPU::V_FMAC_F64_e64:
@@ -5569,7 +5620,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
55695620 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
55705621 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
55715622 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5572- case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
5623+ case AMDGPU::S_FMAC_F16:
5624+ return ST.useRealTrue16Insts () ? AMDGPU::V_FMAC_F16_t16_e64
5625+ : AMDGPU::V_FMAC_F16_fake16_e64;
55735626 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
55745627 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
55755628 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
0 commit comments