@@ -24,6 +24,7 @@ def VOP_F32_F32_F32_F32_VCC : VOPProfile<[f32, f32, f32, f32]> {
2424}
// VOPProfile over four f64 value types (presumably result + three sources -- confirm
// against VOPProfile). Used below by V_DIV_FMAS_F64, which is declared inside a
// `Uses = [MODE, VCC, EXEC]` region.
2525def VOP_F64_F64_F64_F64_VCC : VOPProfile<[f64, f64, f64, f64]> {
2626 let Outs64 = (outs DstRC.RegClass:$vdst);
// Added by this change: sets HasExt64BitDPP on the profile
// (presumably enables the 64-bit DPP variants -- confirm against VOPProfile).
27+ let HasExt64BitDPP = 1;
2728 let IsSingle = 1;
2829}
2930}
@@ -51,7 +52,24 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
5152
5253let HasExt64BitDPP = 1 in {
5354def VOP3b_F32_I1_F32_F32_F32 : VOP3b_Profile<f32>;
// VOP3b f64 profile. The one-line def is replaced by a def with a body that
// makes the VOP3 DPP outputs reuse Outs64 and derives all three DPP asm
// strings (plain, DPP16, DPP8) from Asm64.
54- def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64>;
55+ def VOP3b_F64_I1_F64_F64_F64 : VOP3b_Profile<f64> {
56+ let OutsVOP3DPP = Outs64;
57+ let AsmVOP3DPP = getAsmVOP3DPP<Asm64>.ret;
58+ let AsmVOP3DPP16 = getAsmVOP3DPP16<Asm64>.ret;
59+ let AsmVOP3DPP8 = getAsmVOP3DPP8<Asm64>.ret;
60+ }
61+
// DPP-enabled profile with the same VOPProfile type list as
// VOP3b_I64_I1_I32_I32_I64. Declared inside the enclosing
// `let HasExt64BitDPP = 1 in {` region and used below by V_MAD_U64_U32 and
// V_MAD_I64_I32.
62+ def VOP3b_I64_I1_I32_I32_I64_DPP : VOPProfile<[i64, i32, i32, i64]> {
// The Asm64 string below ends in $clamp, matching this flag.
63+ let HasClamp = 1;
64+
65+ let IsSingle = 1;
// Two outputs: the vector destination $vdst and a second destination $sdst.
66+ let Outs64 = (outs DstRC:$vdst, VOPDstS64orS32:$sdst);
// The DPP forms reuse the same outputs and derive their asm strings from Asm64.
67+ let OutsVOP3DPP = Outs64;
68+ let Asm64 = "$vdst, $sdst, $src0, $src1, $src2$clamp";
69+ let AsmVOP3DPP = getAsmVOP3DPP<Asm64>.ret;
70+ let AsmVOP3DPP16 = getAsmVOP3DPP16<Asm64>.ret;
71+ let AsmVOP3DPP8 = getAsmVOP3DPP8<Asm64>.ret;
72+ }
5573
5674class V_MUL_PROF<VOPProfile P> : VOP3_Profile<P> {
5775 let HasExtVOP3DPP = 0;
@@ -229,7 +247,7 @@ defm V_DIV_FMAS_F32 : VOP3Inst_Pseudo_Wrapper <"v_div_fmas_f32", VOP_F32_F32_F32
229247// result *= 2^64
230248//
// f64 variant of V_DIV_FMAS. This change replaces VOP3Inst_Pseudo_Wrapper
// (called with an empty pattern list) by plain VOP3Inst with no pattern
// argument; the profile VOP_F64_F64_F64_F64_VCC is unchanged here.
231249let SchedRW = [WriteDouble], FPDPRounding = 1 in
232- defm V_DIV_FMAS_F64 : VOP3Inst_Pseudo_Wrapper <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC, [] >;
250+ defm V_DIV_FMAS_F64 : VOP3Inst <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC>;
233251} // End Uses = [MODE, VCC, EXEC]
234252
235253} // End isCommutable = 1
@@ -294,7 +312,7 @@ defm V_CVT_PK_U8_F32 : VOP3Inst<"v_cvt_pk_u8_f32", VOP3_Profile<VOP_I32_F32_I32_
294312defm V_DIV_FIXUP_F32 : VOP3Inst <"v_div_fixup_f32", DIV_FIXUP_F32_PROF, AMDGPUdiv_fixup>;
295313
// Double-precision instructions sharing SchedRW = [WriteDoubleAdd] and
// FPDPRounding = 1.
296314let SchedRW = [WriteDoubleAdd], FPDPRounding = 1 in {
// V_DIV_FIXUP_F64 now takes the named profile VOP_F64_F64_F64_F64_DPP_PROF
// instead of an inline VOP3_Profile<VOP_F64_F64_F64_F64>. That profile is
// defined outside this view; the selection node AMDGPUdiv_fixup is unchanged.
297- defm V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_F64> , AMDGPUdiv_fixup>;
315+ defm V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP_F64_F64_F64_F64_DPP_PROF , AMDGPUdiv_fixup>;
298316 defm V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, any_fldexp>;
299317} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
300318} // End isReMaterializable = 1
@@ -335,7 +353,7 @@ let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it d
335353
336354 // Double precision division pre-scale.
// Now declared with plain VOP3Inst rather than VOP3Inst_Pseudo_Wrapper. The
// profile name is unchanged, but the profile itself gains VOP3 DPP outputs
// and asm strings earlier in this file.
337355 let SchedRW = [WriteDouble, WriteSALU], FPDPRounding = 1 in
338- defm V_DIV_SCALE_F64 : VOP3Inst_Pseudo_Wrapper <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64>;
356+ defm V_DIV_SCALE_F64 : VOP3Inst <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64>;
339357} // End mayRaiseFPException = 0
340358
341359let isReMaterializable = 1 in
@@ -408,9 +426,9 @@ defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>;
408426} // End SubtargetPredicate = isGFX7Plus
409427
410428let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in {
// GFX7+ 64-bit multiply-add pair. This change does two things:
//  * HasNotMADIntraFwdBug moves from the enclosing `let OtherPredicates = ...`
//    to an explicit trailing argument of VOP3Inst, with null_frag filling the
//    pattern slot before it.
//  * Both instructions switch from VOP3b_I64_I1_I32_I32_I64 to the
//    VOP3b_I64_I1_I32_I32_I64_DPP profile.
411- let SubtargetPredicate = isGFX7Plus, OtherPredicates = [HasNotMADIntraFwdBug] in {
412- defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64 >;
413- defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64 >;
429+ let SubtargetPredicate = isGFX7Plus in {
430+ defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64_DPP, null_frag, [HasNotMADIntraFwdBug] >;
431+ defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64_DPP, null_frag, [HasNotMADIntraFwdBug] >;
414432 }
415433 let SubtargetPredicate = isGFX11Only, OtherPredicates = [HasMADIntraFwdBug],
416434 Constraints = "@earlyclobber $vdst" in {
@@ -2054,8 +2072,8 @@ defm V_S_SQRT_F32 : VOP3Only_Real_Base_gfx12<0x288>;
// GFX12 real encodings. Every entry in this run uses a *_gfx12 multiclass.
20542072defm V_S_SQRT_F16 : VOP3Only_Real_Base_gfx12<0x289>;
20552073defm V_MAD_CO_U64_U32 : VOP3be_Real_with_name_gfx12<0x2fe, "V_MAD_U64_U32", "v_mad_co_u64_u32">;
20562074defm V_MAD_CO_I64_I32 : VOP3be_Real_with_name_gfx12<0x2ff, "V_MAD_I64_I32", "v_mad_co_i64_i32">;
2057- defm V_MINIMUM_F64 : VOP3Only_Real_Base_gfx12 <0x341>;
2058- defm V_MAXIMUM_F64 : VOP3Only_Real_Base_gfx12 <0x342>;
// V_MINIMUM_F64 / V_MAXIMUM_F64 move from the Base real to the Realtriple
// real, but must stay gfx12-only like the lines they replace and like their
// f32 siblings just below. A *_gfx11_gfx12 multiclass would presumably also
// emit GFX11 reals for these opcodes.
2075+ defm V_MINIMUM_F64 : VOP3Only_Realtriple_gfx12 <0x341>;
2076+ defm V_MAXIMUM_F64 : VOP3Only_Realtriple_gfx12 <0x342>;
20592077defm V_MINIMUM_F32 : VOP3Only_Realtriple_gfx12<0x365>;
20602078defm V_MAXIMUM_F32 : VOP3Only_Realtriple_gfx12<0x366>;
20612079defm V_MINIMUM_F16 : VOP3Only_Realtriple_t16_and_fake16_gfx12<0x367, "v_minimum_f16">;
@@ -2127,6 +2145,13 @@ multiclass VOP3be_Real_gfx11_gfx12<bits<10> op, string opName, string asmName> :
21272145 VOP3be_Real<GFX11Gen, op, opName, asmName>,
21282146 VOP3be_Real<GFX12Gen, op, opName, asmName>;
21292147
// Same shape as VOP3be_Real_gfx11_gfx12, except the second instantiation uses
// GFX12Not12_50Gen instead of GFX12Gen.
2148+ multiclass VOP3be_Real_gfx11_gfx12_not_gfx1250<bits<10> op, string opName, string asmName> :
2149+ VOP3be_Real<GFX11Gen, op, opName, asmName>,
2150+ VOP3be_Real<GFX12Not12_50Gen, op, opName, asmName>;
2151+
// Single-generation wrapper: instantiates VOP3be_Realtriple for GFX1250Gen
// only, taking just the opcode. Used for the gfx1250 real of V_DIV_SCALE_F64.
2152+ multiclass VOP3be_Realtriple_gfx1250<bits<10> op> :
2153+ VOP3be_Realtriple<GFX1250Gen, op>;
2154+
// Instantiates VOP3_Real_No_Suffix for both GFX11Gen and GFX12Gen with the
// same opcode.
21302155multiclass VOP3_Real_No_Suffix_gfx11_gfx12<bits<10> op> :
21312156 VOP3_Real_No_Suffix<GFX11Gen, op>, VOP3_Real_No_Suffix<GFX12Gen, op>;
21322157
@@ -2141,7 +2166,7 @@ defm V_BFE_U32 : VOP3_Realtriple_gfx11_gfx12<0x210>;
21412166defm V_BFE_I32 : VOP3_Realtriple_gfx11_gfx12<0x211>;
21422167defm V_BFI_B32 : VOP3_Realtriple_gfx11_gfx12<0x212>;
21432168defm V_FMA_F32 : VOP3_Realtriple_gfx11_gfx12<0x213>;
// V_FMA_F64 is narrowed to the *_not_gfx1250 multiclass; its gfx1250 real is
// emitted separately via VOP3Only_Realtriple_gfx1250<0x214>.
2144- defm V_FMA_F64 : VOP3_Real_Base_gfx11_gfx12 <0x214>;
2169+ defm V_FMA_F64 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250 <0x214>;
21452170defm V_LERP_U8 : VOP3_Realtriple_gfx11_gfx12<0x215>;
21462171defm V_ALIGNBIT_B32 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x216, "v_alignbit_b32">;
21472172defm V_ALIGNBYTE_B32 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x217, "v_alignbyte_b32">;
@@ -2161,9 +2186,9 @@ defm V_SAD_U16 : VOP3_Realtriple_gfx11_gfx12<0x224>;
21612186defm V_SAD_U32 : VOP3_Realtriple_gfx11_gfx12<0x225>;
21622187defm V_CVT_PK_U8_F32 : VOP3_Realtriple_gfx11_gfx12<0x226>;
21632188defm V_DIV_FIXUP_F32 : VOP3_Real_Base_gfx11_gfx12<0x227>;
// The f64 DIV_FIXUP / DIV_FMAS reals are narrowed to *_not_gfx1250; their
// gfx1250 reals (0x228, 0x238) are emitted separately via
// VOP3Only_Realtriple_gfx1250. The f32 variants are left unchanged.
2164- defm V_DIV_FIXUP_F64 : VOP3_Real_Base_gfx11_gfx12 <0x228>;
2189+ defm V_DIV_FIXUP_F64 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250 <0x228>;
21652190defm V_DIV_FMAS_F32 : VOP3_Real_Base_gfx11_gfx12<0x237>;
2166- defm V_DIV_FMAS_F64 : VOP3_Real_Base_gfx11_gfx12 <0x238>;
2191+ defm V_DIV_FMAS_F64 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250 <0x238>;
21672192defm V_MSAD_U8 : VOP3_Realtriple_gfx11_gfx12<0x239>;
21682193defm V_QSAD_PK_U16_U8 : VOP3_Real_Base_gfx11_gfx12<0x23a>;
21692194defm V_MQSAD_PK_U16_U8 : VOP3_Real_Base_gfx11_gfx12<0x23b>;
@@ -2205,7 +2230,7 @@ defm V_MINMAX_I32 : VOP3_Realtriple_gfx11_gfx12<0x265>;
22052230defm V_DOT2_F16_F16 : VOP3Dot_Realtriple_t16_and_fake16_gfx11_gfx12<0x266, "v_dot2_f16_f16">;
22062231defm V_DOT2_BF16_BF16 : VOP3Dot_Realtriple_t16_and_fake16_gfx11_gfx12<0x267, "v_dot2_bf16_bf16">;
22072232defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11_gfx12<0x2fc, "V_DIV_SCALE_F32", "v_div_scale_f32">;
// V_DIV_SCALE_F64 is narrowed to the *_not_gfx1250 multiclass; its gfx1250
// real is emitted separately via VOP3be_Realtriple_gfx1250<0x2fd>.
2208- defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11_gfx12 <0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">;
2233+ defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11_gfx12_not_gfx1250 <0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">;
22092234defm V_MAD_U64_U32_gfx11 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">;
22102235defm V_MAD_I64_I32_gfx11 : VOP3be_Real_gfx11<0x2ff, "V_MAD_I64_I32_gfx11", "v_mad_i64_i32">;
22112236defm V_ADD_NC_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x303, "v_add_nc_u16">;
@@ -2228,7 +2253,7 @@ defm V_ADD_F64 : VOP3_Real_Base_gfx11<0x327>;
22282253defm V_MUL_F64 : VOP3_Real_Base_gfx11<0x328>;
22292254defm V_MIN_F64 : VOP3_Real_Base_gfx11<0x329>;
22302255defm V_MAX_F64 : VOP3_Real_Base_gfx11<0x32a>;
// V_LDEXP_F64 joins the V_MUL_* entries below in using the *_not_gfx1250
// multiclass; its gfx1250 real is emitted separately via
// VOP3Only_Realtriple_gfx1250<0x32b>.
2231- defm V_LDEXP_F64 : VOP3_Real_Base_gfx11_gfx12 <0x32b>;
2256+ defm V_LDEXP_F64 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250 <0x32b>;
22322257defm V_MUL_LO_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32c>;
22332258defm V_MUL_HI_U32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32d>;
22342259defm V_MUL_HI_I32 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250<0x32e>;
@@ -2237,8 +2262,8 @@ defm V_LSHLREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x33
22372262defm V_LSHRREV_B16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x339, "v_lshrrev_b16">;
22382263defm V_ASHRREV_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x33a, "v_ashrrev_i16">;
22392264defm V_LSHLREV_B64 : VOP3_Real_Base_gfx11<0x33c>;
// The 64-bit right shifts are narrowed to *_not_gfx1250; their gfx1250 reals
// (0x33d, 0x33e) are emitted separately via VOP3Only_Realtriple_gfx1250.
2240- defm V_LSHRREV_B64 : VOP3_Real_Base_gfx11_gfx12 <0x33d>;
2241- defm V_ASHRREV_I64 : VOP3_Real_Base_gfx11_gfx12 <0x33e>;
2265+ defm V_LSHRREV_B64 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250 <0x33d>;
2266+ defm V_ASHRREV_I64 : VOP3_Real_Base_gfx11_gfx12_not_gfx1250 <0x33e>;
22422267defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx11_gfx12<0x360>; // Pseudo in VOP2
22432268let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
22442269 defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx11_gfx12<0x361>; // Pseudo in VOP2
@@ -2260,9 +2285,16 @@ let AssemblerPredicate = isGFX11Plus in {
22602285}
22612286
22622287// These instructions differ from GFX12 variant by supporting DPP:
// Each opcode below matches the opcode passed to the same instruction's
// *_not_gfx1250 real earlier in this file. V_DIV_SCALE_F64 uses the VOP3be
// flavour; all the others use VOP3Only_Realtriple_gfx1250.
2288+ defm V_FMA_F64 : VOP3Only_Realtriple_gfx1250<0x214>;
2289+ defm V_DIV_FIXUP_F64 : VOP3Only_Realtriple_gfx1250<0x228>;
2290+ defm V_DIV_FMAS_F64 : VOP3Only_Realtriple_gfx1250<0x238>;
2291+ defm V_DIV_SCALE_F64 : VOP3be_Realtriple_gfx1250<0x2fd>;
2292+ defm V_LDEXP_F64 : VOP3Only_Realtriple_gfx1250<0x32b>;
22632293defm V_MUL_LO_U32 : VOP3Only_Realtriple_gfx1250<0x32c>;
22642294defm V_MUL_HI_U32 : VOP3Only_Realtriple_gfx1250<0x32d>;
22652295defm V_MUL_HI_I32 : VOP3Only_Realtriple_gfx1250<0x32e>;
2296+ defm V_LSHRREV_B64 : VOP3Only_Realtriple_gfx1250<0x33d>;
2297+ defm V_ASHRREV_I64 : VOP3Only_Realtriple_gfx1250<0x33e>;
22662298
22672299defm V_PERM_PK16_B4_U4 : VOP3Only_Real_Base_gfx1250<0x23f>;
22682300defm V_PERM_PK16_B6_U4 : VOP3Only_Real_Base_gfx1250<0x242>;
0 commit comments