@@ -64,6 +64,13 @@ class VOP3P_Mix_Profile<VOPProfile P, VOP3Features Features = VOP3_REGULAR,
64
64
"$vdst, $src0_modifiers, $src1_modifiers, $src2_modifiers$op_sel$op_sel_hi$clamp";
65
65
}
66
66
67
// Profile for the true16 form of the mix instructions.
// Derives from VOP3P_Mix_Profile with its third template argument fixed to 0
// (the mixlo/mixhi profiles in this file pass 1 for that argument), marks the
// profile as both IsTrue16 and IsRealTrue16, and overrides the 64-bit-encoding
// destination operand with the one getVALUDstForVT returns for the profile's
// destination type when the IsTrue16 and IsVOP3Encoding flags are set.
+ class VOP3P_Mix_Profile_t16<VOPProfile P, VOP3Features Features = VOP3_REGULAR>
68
+ : VOP3P_Mix_Profile<P, Features, 0> {
69
+ let IsTrue16 = 1;
70
+ let IsRealTrue16 = 1;
71
+ let DstRC64 = getVALUDstForVT<P.DstVT, 1 /*IsTrue16*/, 1 /*IsVOP3Encoding*/>.ret;
72
+ }
73
+
67
74
multiclass VOP3PInst<string OpName, VOPProfile P,
68
75
SDPatternOperator node = null_frag, bit IsDOT = 0> {
69
76
def NAME : VOP3P_Pseudo<OpName, P,
@@ -95,6 +102,16 @@ multiclass VOP3_VOP3PInst<string OpName, VOP3P_Mix_Profile P> {
95
102
} // end SubtargetPredicate = isGFX11Plus
96
103
}
97
104
105
// Defines the pseudo instruction(s) for a true16 mix opcode.
//   OpName - assembly mnemonic passed to the pseudo classes.
//   P      - profile describing the operands.
// Always emits NAME as a VOP3P_Pseudo. When the profile reports
// HasExtVOP3DPP, also emits NAME#_dpp as a VOP3_DPP_Pseudo flagged VOP3P,
// with PseudoInstr set to OpName#"_dpp".
+ multiclass VOP3_VOP3PInst_t16<string OpName, VOP3P_Mix_Profile P> {
106
+ def NAME : VOP3P_Pseudo<OpName, P>;
107
+
108
+ if P.HasExtVOP3DPP then
109
+ def _dpp : VOP3_DPP_Pseudo<OpName, P> {
110
+ let VOP3P = 1;
111
+ let PseudoInstr = OpName#"_dpp";
112
+ }
113
+ }
114
+
98
115
let isReMaterializable = 1 in {
99
116
let isCommutable = 1 in {
100
117
defm V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
@@ -160,12 +177,9 @@ defm V_PK_MAXIMUM3_F16 : VOP3PInst<"v_pk_maximum3_f16", VOP3P_Profile<VOP_V2F16_
160
177
161
178
// TODO: Make sure we're doing the right thing with denormals. Note
162
179
// that FMA and MAD will differ.
163
- multiclass MadFmaMixPats<SDPatternOperator fma_like,
164
- Instruction mix_inst,
165
- Instruction mixlo_inst,
166
- Instruction mixhi_inst,
167
- ValueType VT = f16,
168
- ValueType vecVT = v2f16> {
180
+ multiclass MadFmaMixFP32Pats<SDPatternOperator fma_like,
181
+ Instruction mix_inst,
182
+ ValueType VT = f16> {
169
183
defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
170
184
defvar VOP3PMadMixModsExtPat = !if (!eq(VT, bf16), VOP3PMadMixBF16ModsExt, VOP3PMadMixModsExt);
171
185
// At least one of the operands needs to be an fpextend of an f16
@@ -189,7 +203,14 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
189
203
(f32 (VOP3PMadMixModsExtPat VT:$src2, i32:$src2_mods)))),
190
204
(mix_inst $src0_mods, $src0, $src1_mods, $src1, $src2_mods, $src2,
191
205
DSTCLAMP.NONE)>;
206
+ }
192
207
208
+ multiclass MadFmaMixFP16Pats<SDPatternOperator fma_like,
209
+ Instruction mixlo_inst,
210
+ Instruction mixhi_inst,
211
+ ValueType VT = f16,
212
+ ValueType vecVT = v2f16> {
213
+ defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
193
214
def : GCNPat <
194
215
(AMDGPUclamp (build_vector
195
216
(VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
@@ -243,9 +264,6 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
243
264
// FIXME: Special case handling for mixhi (especially for clamp)
244
265
// because dealing with the write to high half of the register is
245
266
// difficult.
246
- foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
247
- let True16Predicate = p in {
248
-
249
267
def : GCNPat <
250
268
(build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
251
269
(f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
@@ -269,45 +287,60 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
269
287
DSTCLAMP.ENABLE,
270
288
VGPR_32:$elt0))
271
289
>;
290
+ }
272
291
273
- } // end True16Predicate
292
+ multiclass MadFmaMixFP16Pats_t16<SDPatternOperator fma_like,
293
+ Instruction mix_inst_16,
294
+ ValueType VT = f16,
295
+ ValueType vecVT = v2f16> {
296
+ defvar VOP3PMadMixModsPat = !if (!eq(VT, bf16), VOP3PMadMixBF16Mods, VOP3PMadMixMods);
297
+ def : GCNPat <
298
+ (VT (fpround (fmul (f32 (VOP3PMadMixModsPat f32:$src0, i32:$src0_modifiers)),
299
+ (f32 (VOP3PMadMixModsPat f32:$src1, i32:$src1_modifiers))))),
300
+ (mix_inst_16 $src0_modifiers, $src0,
301
+ $src1_modifiers, $src1,
302
+ (i32 0), (i32 0),
303
+ DSTCLAMP.NONE)
304
+ >;
274
305
275
- let True16Predicate = UseRealTrue16Insts in {
276
306
def : GCNPat <
277
- (build_vector ( VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
307
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
278
308
(f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
279
- (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))), VT:$elt1),
280
- (vecVT (mixlo_inst $src0_modifiers, $src0,
281
- $src1_modifiers, $src1,
282
- $src2_modifiers, $src2,
283
- DSTCLAMP.NONE,
284
- (REG_SEQUENCE VGPR_32, (VT (IMPLICIT_DEF)), lo16, $elt1, hi16)))
309
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))),
310
+ (mix_inst_16 $src0_modifiers, $src0,
311
+ $src1_modifiers, $src1,
312
+ $src2_modifiers, $src2,
313
+ DSTCLAMP.NONE)
285
314
>;
286
315
316
+
287
317
def : GCNPat <
288
- (build_vector VT:$elt0, (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
289
- (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
290
- (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
291
- (vecVT (mixhi_inst $src0_modifiers, $src0,
292
- $src1_modifiers, $src1,
293
- $src2_modifiers, $src2,
294
- DSTCLAMP.NONE,
295
- (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
318
+ (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
319
+ (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
320
+ (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers)))))),
321
+ (mix_inst_16 $src0_modifiers, $src0,
322
+ $src1_modifiers, $src1,
323
+ $src2_modifiers, $src2,
324
+ DSTCLAMP.ENABLE)
296
325
>;
297
326
298
327
def : GCNPat <
299
- (build_vector
300
- VT:$elt0,
301
- (AMDGPUclamp (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$src0, i32:$src0_modifiers)),
302
- (f32 (VOP3PMadMixModsPat VT:$src1, i32:$src1_modifiers)),
303
- (f32 (VOP3PMadMixModsPat VT:$src2, i32:$src2_modifiers))))))),
304
- (vecVT (mixhi_inst $src0_modifiers, $src0,
305
- $src1_modifiers, $src1,
306
- $src2_modifiers, $src2,
307
- DSTCLAMP.ENABLE,
308
- (REG_SEQUENCE VGPR_32, $elt0, lo16, (VT (IMPLICIT_DEF)), hi16)))
328
+ (AMDGPUclamp (build_vector
329
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$lo_src0, i32:$lo_src0_modifiers)),
330
+ (f32 (VOP3PMadMixModsPat VT:$lo_src1, i32:$lo_src1_modifiers)),
331
+ (f32 (VOP3PMadMixModsPat VT:$lo_src2, i32:$lo_src2_modifiers))))),
332
+ (VT (fpround (fma_like (f32 (VOP3PMadMixModsPat VT:$hi_src0, i32:$hi_src0_modifiers)),
333
+ (f32 (VOP3PMadMixModsPat VT:$hi_src1, i32:$hi_src1_modifiers)),
334
+ (f32 (VOP3PMadMixModsPat VT:$hi_src2, i32:$hi_src2_modifiers))))))),
335
+ (vecVT (REG_SEQUENCE VGPR_32, (mix_inst_16 $lo_src0_modifiers, $lo_src0,
336
+ $lo_src1_modifiers, $lo_src1,
337
+ $lo_src2_modifiers, $lo_src2,
338
+ DSTCLAMP.ENABLE), lo16,
339
+ (mix_inst_16 $hi_src0_modifiers, $hi_src0,
340
+ $hi_src1_modifiers, $hi_src1,
341
+ $hi_src2_modifiers, $hi_src2,
342
+ DSTCLAMP.ENABLE), hi16))
309
343
>;
310
- } // end True16Predicate
311
344
}
312
345
313
346
class MinimumMaximumByMinimum3Maximum3VOP3P<SDPatternOperator node,
@@ -341,7 +374,8 @@ defm V_MAD_MIXHI_F16 : VOP3_VOP3PInst<"v_mad_mixhi_f16", VOP3P_Mix_Profile<VOP_F
341
374
} // End FPDPRounding = 1
342
375
}
343
376
344
- defm : MadFmaMixPats<fmad, V_MAD_MIX_F32, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
377
+ defm : MadFmaMixFP32Pats<fmad, V_MAD_MIX_F32>;
378
+ defm : MadFmaMixFP16Pats<fmad, V_MAD_MIXLO_F16, V_MAD_MIXHI_F16>;
345
379
} // OtherPredicates = [NoFP32Denormals]
346
380
} // End SubtargetPredicate = HasMadMixInsts
347
381
@@ -360,10 +394,19 @@ defm V_FMA_MIXLO_F16 : VOP3_VOP3PInst<"v_fma_mixlo_f16", VOP3P_Mix_Profile<VOP_F
360
394
let ClampLo = 0, ClampHi = 1 in {
361
395
defm V_FMA_MIXHI_F16 : VOP3_VOP3PInst<"v_fma_mixhi_f16", VOP3P_Mix_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL, 1>>;
362
396
}
397
+
398
+ // Pseudo true16 inst for v_fma_mixlo/hi_f16
399
+ defm V_FMA_MIX_F16_t16 : VOP3_VOP3PInst_t16<"v_fma_mix_f16_t16", VOP3P_Mix_Profile_t16<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
363
400
} // End FPDPRounding = 1
364
401
}
365
402
366
- defm : MadFmaMixPats<fma, V_FMA_MIX_F32, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
403
+ defm : MadFmaMixFP32Pats<fma, V_FMA_MIX_F32>;
404
+
405
+ foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
406
+ let True16Predicate = p in
407
+ defm : MadFmaMixFP16Pats<fma, V_FMA_MIXLO_F16, V_FMA_MIXHI_F16>;
408
+ let True16Predicate = UseRealTrue16Insts in
409
+ defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_F16_t16>;
367
410
}
368
411
369
412
let SubtargetPredicate = HasFmaMixBF16Insts in {
@@ -378,10 +421,18 @@ defm V_FMA_MIXLO_BF16 : VOP3_VOP3PInst<"v_fma_mixlo_bf16", VOP3P_Mix_Profile<VOP
378
421
let ClampLo = 0, ClampHi = 1 in {
379
422
defm V_FMA_MIXHI_BF16 : VOP3_VOP3PInst<"v_fma_mixhi_bf16", VOP3P_Mix_Profile<VOP_BF16_BF16_BF16_BF16, VOP3_OPSEL, 1>>;
380
423
}
424
+
425
+ // Pseudo true16 inst for v_fma_mixlo/hi_bf16
// NOTE(review): this bf16 pseudo is built on VOP_F16_F16_F16_F16, whereas
// V_FMA_MIXLO_BF16 / V_FMA_MIXHI_BF16 use VOP_BF16_BF16_BF16_BF16. Looks like
// a copy of the f16 definition - confirm whether the bf16 profile is intended.
426
+ defm V_FMA_MIX_BF16_t16 : VOP3_VOP3PInst_t16<"v_fma_mix_bf16_t16", VOP3P_Mix_Profile_t16<VOP_F16_F16_F16_F16, VOP3_OPSEL>>;
381
427
} // End FPDPRounding = 1
382
428
} // End isCommutable = 1
383
429
384
- defm : MadFmaMixPats<fma, V_FMA_MIX_F32_BF16, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>;
430
+ defm : MadFmaMixFP32Pats<fma, V_FMA_MIX_F32_BF16, bf16>;
431
+ foreach p = [NotHasTrue16BitInsts, UseFakeTrue16Insts] in
432
+ let True16Predicate = p in
433
+ defm : MadFmaMixFP16Pats<fma, V_FMA_MIXLO_BF16, V_FMA_MIXHI_BF16, bf16, v2bf16>;
434
// Real-true16 selection patterns for the bf16 mix pseudo.
// bf16/v2bf16 must be passed explicitly: MadFmaMixFP16Pats_t16 defaults VT and
// vecVT to f16/v2f16, which would emit f16 patterns (using the f16 source
// modifier matcher) that select the bf16 instruction. This matches the
// non-true16 bf16 instantiation of MadFmaMixFP16Pats in this same block.
+ let True16Predicate = UseRealTrue16Insts in
435
+ defm : MadFmaMixFP16Pats_t16<fma, V_FMA_MIX_BF16_t16, bf16, v2bf16>;
385
436
} // End SubtargetPredicate = HasFmaMixBF16Insts
386
437
387
438
def PK_ADD_MINMAX_Profile : VOP3P_Profile<VOP_V2I16_V2I16_V2I16_V2I16, VOP3_PACKED> {
0 commit comments