@@ -3533,6 +3533,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35333533 Opc == AMDGPU::V_MAD_F16_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
35343534 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
35353535 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3536+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
35363537 Opc == AMDGPU::V_FMAC_F16_fake16_e64) {
35373538 // Don't fold if we are using source or output modifiers. The new VOP2
35383539 // instructions don't have them.
@@ -3555,6 +3556,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35553556 bool IsFMA =
35563557 Opc == AMDGPU::V_FMA_F32_e64 || Opc == AMDGPU::V_FMAC_F32_e64 ||
35573558 Opc == AMDGPU::V_FMA_F16_e64 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3559+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
35583560 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
35593561 MachineOperand *Src1 = getNamedOperand (UseMI, AMDGPU::OpName::src1);
35603562 MachineOperand *Src2 = getNamedOperand (UseMI, AMDGPU::OpName::src2);
@@ -3588,16 +3590,19 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
35883590
35893591 unsigned NewOpc =
35903592 IsFMA ? (IsF32 ? AMDGPU::V_FMAMK_F32
3591- : ST.hasTrue16BitInsts () ? AMDGPU::V_FMAMK_F16_fake16
3593+ : ST.hasTrue16BitInsts () ? ST.useRealTrue16Insts ()
3594+ ? AMDGPU::V_FMAMK_F16_t16
3595+ : AMDGPU::V_FMAMK_F16_fake16
35923596 : AMDGPU::V_FMAMK_F16)
35933597 : (IsF32 ? AMDGPU::V_MADMK_F32 : AMDGPU::V_MADMK_F16);
35943598 if (pseudoToMCOpcode (NewOpc) == -1 )
35953599 return false ;
35963600
3597- // V_FMAMK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3598- // would also require restricting their register classes. For now
3599- // just bail out.
3600- if (NewOpc == AMDGPU::V_FMAMK_F16_fake16)
3601+ // V_FMAMK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAMK_F16_fake16
3602+ // takes VGPR_32_Lo128 operands, so the rewrite would also require
3603+ // restricting their register classes. For now just bail out.
3604+ if (NewOpc == AMDGPU::V_FMAMK_F16_t16 ||
3605+ NewOpc == AMDGPU::V_FMAMK_F16_fake16)
36013606 return false ;
36023607
36033608 const std::optional<int64_t > SubRegImm = extractSubregFromImm (
@@ -3613,7 +3618,7 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
36133618 Src0->setIsKill (RegSrc->isKill ());
36143619
36153620 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3616- Opc == AMDGPU::V_FMAC_F32_e64 ||
3621+ Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
36173622 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
36183623 UseMI.untieRegOperand (
36193624 AMDGPU::getNamedOperandIdx (Opc, AMDGPU::OpName::src2));
@@ -3668,23 +3673,26 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
36683673
36693674 unsigned NewOpc =
36703675 IsFMA ? (IsF32 ? AMDGPU::V_FMAAK_F32
3671- : ST.hasTrue16BitInsts () ? AMDGPU::V_FMAAK_F16_fake16
3676+ : ST.hasTrue16BitInsts () ? ST.useRealTrue16Insts ()
3677+ ? AMDGPU::V_FMAAK_F16_t16
3678+ : AMDGPU::V_FMAAK_F16_fake16
36723679 : AMDGPU::V_FMAAK_F16)
36733680 : (IsF32 ? AMDGPU::V_MADAK_F32 : AMDGPU::V_MADAK_F16);
36743681 if (pseudoToMCOpcode (NewOpc) == -1 )
36753682 return false ;
36763683
3677- // V_FMAAK_F16_fake16 takes VGPR_32_Lo128 operands, so the rewrite
3678- // would also require restricting their register classes. For now
3679- // just bail out.
3680- if (NewOpc == AMDGPU::V_FMAAK_F16_fake16)
3684+ // V_FMAAK_F16_t16 takes VGPR_16_Lo128 operands while V_FMAAK_F16_fake16
3685+ // takes VGPR_32_Lo128 operands, so the rewrite would also require
3686+ // restricting their register classes. For now just bail out.
3687+ if (NewOpc == AMDGPU::V_FMAAK_F16_t16 ||
3688+ NewOpc == AMDGPU::V_FMAAK_F16_fake16)
36813689 return false ;
36823690
36833691 // FIXME: This would be a lot easier if we could return a new instruction
36843692 // instead of having to modify in place.
36853693
36863694 if (Opc == AMDGPU::V_MAC_F32_e64 || Opc == AMDGPU::V_MAC_F16_e64 ||
3687- Opc == AMDGPU::V_FMAC_F32_e64 ||
3695+ Opc == AMDGPU::V_FMAC_F32_e64 || Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
36883696 Opc == AMDGPU::V_FMAC_F16_fake16_e64 || Opc == AMDGPU::V_FMAC_F16_e64)
36893697 UseMI.untieRegOperand (
36903698 AMDGPU::getNamedOperandIdx (Opc, AMDGPU::OpName::src2));
@@ -3874,8 +3882,11 @@ static unsigned getNewFMAInst(const GCNSubtarget &ST, unsigned Opc) {
38743882 return AMDGPU::V_FMA_LEGACY_F32_e64;
38753883 case AMDGPU::V_FMAC_F16_e32:
38763884 case AMDGPU::V_FMAC_F16_e64:
3885+ case AMDGPU::V_FMAC_F16_t16_e64:
38773886 case AMDGPU::V_FMAC_F16_fake16_e64:
3878- return ST.hasTrue16BitInsts () ? AMDGPU::V_FMA_F16_gfx9_fake16_e64
3887+ return ST.hasTrue16BitInsts () ? ST.useRealTrue16Insts ()
3888+ ? AMDGPU::V_FMA_F16_gfx9_t16_e64
3889+ : AMDGPU::V_FMA_F16_gfx9_fake16_e64
38793890 : AMDGPU::V_FMA_F16_gfx9_e64;
38803891 case AMDGPU::V_FMAC_F32_e32:
38813892 case AMDGPU::V_FMAC_F32_e64:
@@ -3941,19 +3952,22 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
39413952 return MIB;
39423953 }
39433954
3944- assert (
3945- Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3946- " V_FMAC_F16_fake16_e32 is not supported and not expected to be present "
3947- " pre-RA" );
3955+ assert (Opc != AMDGPU::V_FMAC_F16_t16_e32 &&
3956+ Opc != AMDGPU::V_FMAC_F16_fake16_e32 &&
3957+ " V_FMAC_F16_t16/fake16_e32 is not supported and not expected to be "
3958+ " present "
3959+ " pre-RA" );
39483960
39493961 // Handle MAC/FMAC.
39503962 bool IsF16 = Opc == AMDGPU::V_MAC_F16_e32 || Opc == AMDGPU::V_MAC_F16_e64 ||
39513963 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3964+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
39523965 Opc == AMDGPU::V_FMAC_F16_fake16_e64;
39533966 bool IsFMA = Opc == AMDGPU::V_FMAC_F32_e32 || Opc == AMDGPU::V_FMAC_F32_e64 ||
39543967 Opc == AMDGPU::V_FMAC_LEGACY_F32_e32 ||
39553968 Opc == AMDGPU::V_FMAC_LEGACY_F32_e64 ||
39563969 Opc == AMDGPU::V_FMAC_F16_e32 || Opc == AMDGPU::V_FMAC_F16_e64 ||
3970+ Opc == AMDGPU::V_FMAC_F16_t16_e64 ||
39573971 Opc == AMDGPU::V_FMAC_F16_fake16_e64 ||
39583972 Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
39593973 bool IsF64 = Opc == AMDGPU::V_FMAC_F64_e32 || Opc == AMDGPU::V_FMAC_F64_e64;
@@ -3968,6 +3982,7 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
39683982 return nullptr ;
39693983 case AMDGPU::V_MAC_F16_e64:
39703984 case AMDGPU::V_FMAC_F16_e64:
3985+ case AMDGPU::V_FMAC_F16_t16_e64:
39713986 case AMDGPU::V_FMAC_F16_fake16_e64:
39723987 case AMDGPU::V_MAC_F32_e64:
39733988 case AMDGPU::V_MAC_LEGACY_F32_e64:
@@ -4053,8 +4068,11 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
40534068 int64_t Imm;
40544069 if (!Src0Literal && getFoldableImm (Src2, Imm, &DefMI)) {
40554070 unsigned NewOpc =
4056- IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts () ? AMDGPU::V_FMAAK_F16_fake16
4057- : AMDGPU::V_FMAAK_F16)
4071+ IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts ()
4072+ ? ST.useRealTrue16Insts ()
4073+ ? AMDGPU::V_FMAAK_F16_t16
4074+ : AMDGPU::V_FMAAK_F16_fake16
4075+ : AMDGPU::V_FMAAK_F16)
40584076 : AMDGPU::V_FMAAK_F32)
40594077 : (IsF16 ? AMDGPU::V_MADAK_F16 : AMDGPU::V_MADAK_F32);
40604078 if (pseudoToMCOpcode (NewOpc) != -1 ) {
@@ -4071,11 +4089,14 @@ MachineInstr *SIInstrInfo::convertToThreeAddress(MachineInstr &MI,
40714089 return MIB;
40724090 }
40734091 }
4074- unsigned NewOpc =
4075- IsFMA ? (IsF16 ? (ST.hasTrue16BitInsts () ? AMDGPU::V_FMAMK_F16_fake16
4076- : AMDGPU::V_FMAMK_F16)
4077- : AMDGPU::V_FMAMK_F32)
4078- : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
4092+ unsigned NewOpc = IsFMA
4093+ ? (IsF16 ? (ST.hasTrue16BitInsts ()
4094+ ? ST.useRealTrue16Insts ()
4095+ ? AMDGPU::V_FMAMK_F16_t16
4096+ : AMDGPU::V_FMAMK_F16_fake16
4097+ : AMDGPU::V_FMAMK_F16)
4098+ : AMDGPU::V_FMAMK_F32)
4099+ : (IsF16 ? AMDGPU::V_MADMK_F16 : AMDGPU::V_MADMK_F32);
40794100 if (!Src0Literal && getFoldableImm (Src1, Imm, &DefMI)) {
40804101 if (pseudoToMCOpcode (NewOpc) != -1 ) {
40814102 MIB = BuildMI (MBB, MI, MI.getDebugLoc (), get (NewOpc))
@@ -4513,6 +4534,7 @@ bool SIInstrInfo::canShrink(const MachineInstr &MI,
45134534 case AMDGPU::V_MAC_F32_e64:
45144535 case AMDGPU::V_MAC_LEGACY_F32_e64:
45154536 case AMDGPU::V_FMAC_F16_e64:
4537+ case AMDGPU::V_FMAC_F16_t16_e64:
45164538 case AMDGPU::V_FMAC_F16_fake16_e64:
45174539 case AMDGPU::V_FMAC_F32_e64:
45184540 case AMDGPU::V_FMAC_F64_e64:
@@ -5569,7 +5591,9 @@ unsigned SIInstrInfo::getVALUOp(const MachineInstr &MI) const {
55695591 case AMDGPU::S_MUL_F16: return AMDGPU::V_MUL_F16_fake16_e64;
55705592 case AMDGPU::S_CVT_PK_RTZ_F16_F32: return AMDGPU::V_CVT_PKRTZ_F16_F32_e64;
55715593 case AMDGPU::S_FMAC_F32: return AMDGPU::V_FMAC_F32_e64;
5572- case AMDGPU::S_FMAC_F16: return AMDGPU::V_FMAC_F16_fake16_e64;
5594+ case AMDGPU::S_FMAC_F16:
5595+ return ST.useRealTrue16Insts () ? AMDGPU::V_FMAC_F16_t16_e64
5596+ : AMDGPU::V_FMAC_F16_fake16_e64;
55735597 case AMDGPU::S_FMAMK_F32: return AMDGPU::V_FMAMK_F32;
55745598 case AMDGPU::S_FMAAK_F32: return AMDGPU::V_FMAAK_F32;
55755599 case AMDGPU::S_CMP_LT_F32: return AMDGPU::V_CMP_LT_F32_e64;
0 commit comments