@@ -154,10 +154,12 @@ defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_
154154multiclass MadFmaMixPats<SDPatternOperator fma_like,
155155 Instruction mix_inst,
156156 Instruction mixlo_inst,
157- Instruction mixhi_inst> {
157+ Instruction mixhi_inst,
158+ bit HasFP32Denormals> {
158159 // At least one of the operands needs to be an fpextend of an f16
159160 // for this to be worthwhile, so we need three patterns here.
160161 // TODO: Could we use a predicate to inspect src1/2/3 instead?
162+ let OtherPredicates = !if(HasFP32Denormals, [TruePredicate], [NoFP32Denormals]) in {
161163 def : GCNPat <
162164 (f32 (fma_like (f32 (VOP3PMadMixModsExt f16:$src0, i32:$src0_mods)),
163165 (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_mods)),
@@ -177,6 +179,45 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
177179 (mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
178180 DSTCLAMP.NONE)>;
179181
182+ def : GCNPat <
183+ (AMDGPUclamp (build_vector
184+ (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
185+ (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
186+ (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))),
187+ (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
188+ (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
189+ (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))),
190+ (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
191+ $hi_src1_modifiers, $hi_src1,
192+ $hi_src2_modifiers, $hi_src2,
193+ DSTCLAMP.ENABLE,
194+ (mixlo_inst $lo_src0_modifiers, $lo_src0,
195+ $lo_src1_modifiers, $lo_src1,
196+ $lo_src2_modifiers, $lo_src2,
197+ DSTCLAMP.ENABLE,
198+ (i32 (IMPLICIT_DEF)))))
199+ >;
200+
201+ def : GCNPat <
202+ (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
203+ (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
204+ (mixlo_inst $src0_modifiers, $src0,
205+ $src1_modifiers, $src1,
206+ (i32 0), (i32 0),
207+ DSTCLAMP.NONE,
208+ (i32 (IMPLICIT_DEF)))
209+ >;
210+
211+ def : GCNPat <
212+ (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
213+ (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))),
214+ (v2f16 (mixhi_inst $src0_modifiers, $src0,
215+ $src1_modifiers, $src1,
216+ (i32 0), (i32 0),
217+ DSTCLAMP.NONE,
218+ VGPR_32:$elt0))
219+ >;
220+
180221 def : GCNPat <
181222 (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
182223 (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
@@ -187,10 +228,14 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
187228 DSTCLAMP.NONE,
188229 (i32 (IMPLICIT_DEF)))
189230 >;
231+ } // End OtherPredicates
190232
191233 // FIXME: Special case handling for mixhi (especially for clamp)
192234 // because dealing with the write to high half of the register is
193235 // difficult.
236+ foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
237+ let OtherPredicates = !if(HasFP32Denormals, [TruePredicate], [NoFP32Denormals]), True16Predicate = p in {
238+
194239 def : GCNPat <
195240 (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
196241 (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
@@ -215,44 +260,44 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
215260 VGPR_32:$elt0))
216261 >;
217262
218- def : GCNPat <
219- (AMDGPUclamp (build_vector
220- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$lo_src0, i32:$lo_src0_modifiers)),
221- (f32 (VOP3PMadMixMods f16:$lo_src1, i32:$lo_src1_modifiers)),
222- (f32 (VOP3PMadMixMods f16:$lo_src2, i32:$lo_src2_modifiers))))),
223- (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$hi_src0, i32:$hi_src0_modifiers)),
224- (f32 (VOP3PMadMixMods f16:$hi_src1, i32:$hi_src1_modifiers)),
225- (f32 (VOP3PMadMixMods f16:$hi_src2, i32:$hi_src2_modifiers))))))),
226- (v2f16 (mixhi_inst $hi_src0_modifiers, $hi_src0,
227- $hi_src1_modifiers, $hi_src1,
228- $hi_src2_modifiers, $hi_src2,
229- DSTCLAMP.ENABLE,
230- (mixlo_inst $lo_src0_modifiers, $lo_src0,
231- $lo_src1_modifiers, $lo_src1,
232- $lo_src2_modifiers, $lo_src2,
233- DSTCLAMP.ENABLE,
234- (i32 (IMPLICIT_DEF)))))
235- >;
263+ } // End foreach p: OtherPredicates, True16Predicate = p
236264
265+ let OtherPredicates = !if(HasFP32Denormals, [TruePredicate], [NoFP32Denormals]), True16Predicate = UseRealTrue16Insts in {
237266 def : GCNPat <
238- (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
239- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers))))),
240- (mixlo_inst $src0_modifiers, $src0,
267+ (build_vector (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
268+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
269+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))), f16:$elt1),
270+ (v2f16 (mixlo_inst $src0_modifiers, $src0,
241271 $src1_modifiers, $src1,
242- (i32 0), (i32 0) ,
272+ $src2_modifiers, $src2 ,
243273 DSTCLAMP.NONE,
244- (i32 ( IMPLICIT_DEF)))
274+ (REG_SEQUENCE VGPR_32, (f16 ( IMPLICIT_DEF)), lo16, $elt1, hi16 )))
245275 >;
246276
247277 def : GCNPat <
248- (build_vector f16:$elt0, (f16 (fpround (fmul (f32 (VOP3PMadMixMods f32:$src0, i32:$src0_modifiers)),
249- (f32 (VOP3PMadMixMods f32:$src1, i32:$src1_modifiers)))))),
278+ (build_vector f16:$elt0, (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
279+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
280+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers)))))),
250281 (v2f16 (mixhi_inst $src0_modifiers, $src0,
251282 $src1_modifiers, $src1,
252- (i32 0), (i32 0) ,
283+ $src2_modifiers, $src2 ,
253284 DSTCLAMP.NONE,
254- VGPR_32:$elt0))
285+ (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
286+ >;
287+
288+ def : GCNPat <
289+ (build_vector
290+ f16:$elt0,
291+ (AMDGPUclamp (f16 (fpround (fma_like (f32 (VOP3PMadMixMods f16:$src0, i32:$src0_modifiers)),
292+ (f32 (VOP3PMadMixMods f16:$src1, i32:$src1_modifiers)),
293+ (f32 (VOP3PMadMixMods f16:$src2, i32:$src2_modifiers))))))),
294+ (v2f16 (mixhi_inst $src0_modifiers, $src0,
295+ $src1_modifiers, $src1,
296+ $src2_modifiers, $src2,
297+ DSTCLAMP.ENABLE,
298+ (REG_SEQUENCE VGPR_32, $elt0, lo16, (f16 (IMPLICIT_DEF)), hi16)))
255299 >;
300+ } // End OtherPredicates, True16Predicate = UseRealTrue16Insts
256301}
257302
258303class MinimumMaximumByMinimum3Maximum3VOP3P<SDPatternOperator node,
@@ -266,7 +311,8 @@ def : MinimumMaximumByMinimum3Maximum3VOP3P<fminimum, V_PK_MINIMUM3_F16>;
266311def : MinimumMaximumByMinimum3Maximum3VOP3P<fmaximum, V_PK_MAXIMUM3_F16>;
267312}
268313
269- let SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals] in {
314+ let SubtargetPredicate = HasMadMixInsts in {
315+ let OtherPredicates = [NoFP32Denormals] in {
270316
271317// These are VOP3a-like opcodes which accept no omod.
272318// Size of src arguments (16/32) is controlled by op_sel.
@@ -284,9 +330,10 @@ defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile<VOP_F
284330}
285331} // End FPDPRounding = 1
286332}
333+ } // End OtherPredicates = [NoFP32Denormals]
287334
288- defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
289- } // End SubtargetPredicate = HasMadMixInsts, OtherPredicates = [NoFP32Denormals]
335+ defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16, 0 /*HasFP32Denormals*/ >;
336+ } // End SubtargetPredicate = HasMadMixInsts
290337
291338
292339// Essentially the same as the mad_mix versions
@@ -306,7 +353,7 @@ defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F
306353} // End FPDPRounding = 1
307354}
308355
309- defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
356+ defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16, 1 /*HasFP32Denormals*/ >;
310357}
311358
312359// Defines patterns that extract signed 4bit from each Idx[0].
0 commit comments