@@ -64,6 +64,13 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
6464 "$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
6565}
6666
// True16 variant of VOP3P_Mix_Profile. P and Features are forwarded to the
// base profile unchanged; the base's third template argument is fixed to 0.
// NOTE(review): the third parameter's name is cut off in this view. The
// mixlo/mixhi definitions pass 1 for it - confirm what 0 turns off here.
67+ class VOP3P_Mix_Profile_t16<VOPProfile P, VOP3Features Features = VOP3_REGULAR>
68+ : VOP3P_Mix_Profile<P, Features, 0> {
// Mark the profile as true16 and as real-true16.
69+ let IsTrue16 = 1;
70+ let IsRealTrue16 = 1;
// Override DstRC64 with whatever getVALUDstForVT yields for P.DstVT when
// both its IsTrue16 and IsVOP3Encoding arguments are 1.
71+ let DstRC64 = getVALUDstForVT<P.DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret;
72+ }
73+
6774multiclass VOP3PInst<string OpName, VOPProfile P,
6875 SDPatternOperator node = null_frag, bit IsDOT = 0> {
6976 def NAME : VOP3P_Pseudo<OpName, P,
@@ -95,6 +102,16 @@ multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
95102 } // end SubtargetPredicate = isGFX11Plus
96103}
97104
// Defines the pseudo instruction for a true16 mix op, plus a "_dpp" pseudo
// when the profile reports HasExtVOP3DPP.
//   OpName - mnemonic string given to both pseudos.
//   P      - mix profile describing the operands.
// NOTE(review): the VOP3_VOP3PInst multiclass just above closes a
// "SubtargetPredicate = isGFX11Plus" scope; no such scope wraps the _dpp
// def here - confirm that is intended.
105+ multiclass VOP3_VOP3PInst_t16<string OpName, VOP3P_Mix_Profile P> {
106+ def NAME : VOP3P_Pseudo<OpName, P>;
107+
// Only emit the DPP form if the profile supports it.
108+ if P.HasExtVOP3DPP then
109+ def _dpp : VOP3_DPP_Pseudo<OpName, P> {
110+ let VOP3P = 1;
// PseudoInstr is set to the mnemonic with a "_dpp" suffix.
111+ let PseudoInstr = OpName#"_dpp";
112+ }
113+ }
114+
98115let isReMaterializable = 1 in {
99116let isCommutable = 1 in {
100117defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
@@ -160,12 +177,9 @@ defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_
160177
161178// TODO: Make sure we're doing the right thing with denormals. Note
162179// that FMA and MAD will differ.
163- multiclass MadFmaMixPats<SDPatternOperator fma_like,
164- Instruction mix_inst,
165- Instruction mixlo_inst,
166- Instruction mixhi_inst,
167- ValueType VT = f16,
168- ValueType vecVT = v2f16> {
180+ multiclass MadFmaMixFP32Pats<SDPatternOperator fma_like,
181+ Instruction mix_inst,
182+ ValueType VT = f16> {
169183 defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
170184 defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt);
171185 // At least one of the operands needs to be an fpextend of an f16
@@ -189,7 +203,14 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
189203 (f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))),
190204 (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
191205 DSTCLAMP.NONE)>;
206+ }
192207
208+ multiclass MadFmaMixFP16Pats<SDPatternOperator fma_like,
209+ Instruction mixlo_inst,
210+ Instruction mixhi_inst,
211+ ValueType VT = f16,
212+ ValueType vecVT = v2f16> {
213+ defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
193214 def : GCNPat <
194215 (AMDGPUclamp (build_vector
195216 (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
@@ -243,9 +264,6 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
244265 // FIXME: Special case handling for mixhi (especially for clamp)
244265 // because dealing with the write to high half of the register is
245266 // difficult.
246- foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
247- let True16Predicate = p in {
248-
249267 def : GCNPat <
250268 (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
251269 (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
@@ -269,45 +287,60 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
269287 DSTCLAMP.ENABLE,
270288 VGPR_32:$elt0))
271289 >;
290+ }
272291
273- } // end True16Predicate
// Selection patterns that map an fpround-to-VT of an f32 fmul / fma_like,
// whose operands are matched through VOP3PMadMixModsPat, onto the single
// pseudo mix_inst_16.
//   fma_like    - the fused/unfused multiply-add node to match.
//   mix_inst_16 - instruction emitted by every pattern below.
//   VT          - 16-bit result type (f16 by default; bf16 selects the BF16
//                 modifier matcher).
//   vecVT       - result type of the packed clamp pattern at the end.
292+ multiclass MadFmaMixFP16Pats_t16<SDPatternOperator fma_like,
293+ Instruction mix_inst_16,
294+ ValueType VT = f16,
295+ ValueType vecVT = v2f16> {
// Use the BF16 source-modifier matcher when VT is bf16, otherwise the default one.
296+ defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
// fpround(fmul a, b): the src2 modifiers and src2 are both given as (i32 0);
// clamp is off.
297+ def : GCNPat <
298+ (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
299+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))),
300+ (mix_inst_16 $src0_modifiers, $src0,
301+ $src1_modifiers, $src1,
302+ (i32 0), (i32 0),
303+ DSTCLAMP.NONE)
304+ >;
274305
275- let True16Predicate = UseRealTrue16Insts in {
// fpround(fma_like a, b, c) with clamp off.
276306 def : GCNPat <
277- (build_vector ( VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
307+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
278308 (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
279- (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), VT:$elt1),
280- (vecVT (mixlo_inst $src0_modifiers, $src0,
281- $src1_modifiers, $src1,
282- $src2_modifiers, $src2,
283- DSTCLAMP.NONE,
284- (REG_SEQUENCE VGPR_32, (VT (IMPLICIT_DEF)), lo16, $elt1, hi16)))
309+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))),
310+ (mix_inst_16 $src0_modifiers, $src0,
311+ $src1_modifiers, $src1,
312+ $src2_modifiers, $src2,
313+ DSTCLAMP.NONE)
285314 >;
286315
316+
// The same fpround(fma_like ...) wrapped in AMDGPUclamp: emitted with
// DSTCLAMP.ENABLE.
287317 def : GCNPat <
288- (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
289- (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
290- (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
291- (vecVT (mixhi_inst $src0_modifiers, $src0,
292- $src1_modifiers, $src1,
293- $src2_modifiers, $src2,
294- DSTCLAMP.NONE,
295- (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
318+ (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
319+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
320+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
321+ (mix_inst_16 $src0_modifiers, $src0,
322+ $src1_modifiers, $src1,
323+ $src2_modifiers, $src2,
324+ DSTCLAMP.ENABLE)
296325 >;
297326
// AMDGPUclamp of a two-element build_vector of such results: one
// mix_inst_16 per element, both with clamp enabled, combined by
// REG_SEQUENCE into the lo16 / hi16 sub-registers of a VGPR_32.
298327 def : GCNPat <
299- (build_vector
300- VT:$elt0,
301- (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
302- (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
303- (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
304- (vecVT (mixhi_inst $src0_modifiers, $src0,
305- $src1_modifiers, $src1,
306- $src2_modifiers, $src2,
307- DSTCLAMP.ENABLE,
308- (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
328+ (AMDGPUclamp (build_vector
329+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
330+ (f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)),
331+ (f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))),
332+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)),
333+ (f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)),
334+ (f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))),
335+ (vecVT (REG_SEQUENCE VGPR_32, (mix_inst_16 $lo_src0_modifiers, $lo_src0,
336+ $lo_src1_modifiers, $lo_src1,
337+ $lo_src2_modifiers, $lo_src2,
338+ DSTCLAMP.ENABLE), lo16,
339+ (mix_inst_16 $hi_src0_modifiers, $hi_src0,
340+ $hi_src1_modifiers, $hi_src1,
341+ $hi_src2_modifiers, $hi_src2,
342+ DSTCLAMP.ENABLE), hi16))
309343 >;
310- } // end True16Predicate
311344}
312345
313346class MinimumMaximumByMinimum3Maximum3VOP3P<SDPatternOperator node,
@@ -341,7 +374,8 @@ defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile<VOP_F
341374} // End FPDPRounding = 1
342375}
343376
344- defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
377+ defm : MadFmaMixFP32Pats<fmad, V_MAD_MIX_F32>;
378+ defm : MadFmaMixFP16Pats<fmad, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
345379} // OtherPredicates = [NoFP32Denormals]
346380} // End SubtargetPredicate = HasMadMixInsts
347381
@@ -360,10 +394,19 @@ defm V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3P_Mix_Profile<VOP_F
360394let ClampLo = 0, ClampHi = 1 in {
361395defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;
362396}
397+
398+ // Pseudo true16 inst for v_fma_mixlo/hi_f16
399+ defm V_FMA_MIX_F16_t16 : VOP3_VOP3PInst_t16<"v_fma_mix_f16_t16", VOP3P_Mix_Profile_t16<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
363400} // End FPDPRounding = 1
364401}
365402
366- defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
403+ defm : MadFmaMixFP32Pats<fma, V_FMA_MIX_F32>;
404+
405+ foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
406+ let True16Predicate = p in
407+ defm : MadFmaMixFP16Pats<fma, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
408+ let True16Predicate = UseRealTrue16Insts in
409+ defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_F16_t16>;
367410}
368411
369412let SubtargetPredicate = HasFmaMixBF16Insts in {
@@ -378,10 +421,18 @@ defm V_FMA_MIXLO_BF16 : VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile<VOP
378421let ClampLo = 0, ClampHi = 1 in {
379422defm V_FMA_MIXHI_BF16 : VOP3_VOP3PInst<"v_fma_mixhi_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
380423}
424+
425+ // Pseudo true16 inst for v_fma_mixlo/hi_bf16
// The pseudo takes and produces bf16 values, so it uses the same
// VOP_BF16_BF16_BF16_BF16 profile as V_FMA_MIXHI_BF16 above rather than the
// f16 profile used by V_FMA_MIX_F16_t16.
426+ defm V_FMA_MIX_BF16_t16 : VOP3_VOP3PInst_t16<"v_fma_mix_bf16_t16", VOP3P_Mix_Profile_t16<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL>>;
381427} // End FPDPRounding = 1
382428} // End isCommutable = 1
383429
384- defm : MadFmaMixPats<fma, V_FMA_MIX_F32_BF16, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>;
// f32-result patterns, instantiated for bf16 sources.
430+ defm : MadFmaMixFP32Pats<fma, V_FMA_MIX_F32_BF16, bf16>;
// 16-bit-result patterns for targets without true16 or with fake true16.
431+ foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
432+ let True16Predicate = p in
433+ defm : MadFmaMixFP16Pats<fma, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>;
// Real true16: pass bf16/v2bf16 explicitly. With the multiclass defaults
// (f16/v2f16) these patterns would match f16 operations and select the BF16
// pseudo, and the BF16 modifier matchers would never be used.
434+ let True16Predicate = UseRealTrue16Insts in
435+ defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_BF16_t16, bf16, v2bf16>;
385436} // End SubtargetPredicate = HasFmaMixBF16Insts
386437
387438def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
0 commit comments