Skip to content

Commit f0ec9e1

Browse files
committed
[AMDGPU][True16][CodeGen] fp conversion instructions in true/fake16 format
1 parent ae5bd2a commit f0ec9e1

18 files changed

+1161
-597
lines changed

llvm/lib/Target/AMDGPU/SIInstrInfo.cpp

Lines changed: 19 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -7361,14 +7361,25 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
73617361
const DebugLoc &DL = Inst.getDebugLoc();
73627362
Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
73637363
Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
7364-
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7365-
.addImm(16)
7366-
.add(Inst.getOperand(1));
7367-
BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7368-
.addImm(0) // src0_modifiers
7369-
.addReg(TmpReg)
7370-
.addImm(0) // clamp
7371-
.addImm(0); // omod
7364+
if (ST.useRealTrue16Insts()) {
7365+
BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
7366+
.add(Inst.getOperand(1));
7367+
BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7368+
.addImm(0) // src0_modifiers
7369+
.addReg(TmpReg, 0, AMDGPU::hi16)
7370+
.addImm(0) // clamp
7371+
.addImm(0) // omod
7372+
.addImm(0); // op_sel0
7373+
} else {
7374+
BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
7375+
.addImm(16)
7376+
.add(Inst.getOperand(1));
7377+
BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
7378+
.addImm(0) // src0_modifiers
7379+
.addReg(TmpReg)
7380+
.addImm(0) // clamp
7381+
.addImm(0); // omod
7382+
}
73727383

73737384
MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
73747385
addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);

llvm/lib/Target/AMDGPU/SIInstrInfo.td

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2647,6 +2647,7 @@ class VOPProfile_Fake16<VOPProfile P> : VOPProfile<P.ArgVT> {
26472647
// Most DstVT are 16-bit, but not all
26482648
let DstRC = getVALUDstForVT_fake16<DstVT>.ret;
26492649
let DstRC64 = getVALUDstForVT<DstVT>.ret;
2650+
let Src0RC32 = getVOPSrc0ForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
26502651
let Src1RC32 = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
26512652
let Src0DPP = getVregSrcForVT<Src0VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;
26522653
let Src1DPP = getVregSrcForVT<Src1VT, 1/*IsTrue16*/, 1/*IsFake16*/>.ret;

llvm/lib/Target/AMDGPU/SIInstructions.td

Lines changed: 62 additions & 26 deletions
Original file line number | Diff line number | Diff line change
@@ -1094,7 +1094,7 @@ def : Pat <
10941094
// VOP1 Patterns
10951095
//===----------------------------------------------------------------------===//
10961096

1097-
multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
1097+
multiclass f16_to_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16_inst_e64> {
10981098
// f16_to_fp patterns
10991099
def : GCNPat <
11001100
(f32 (any_f16_to_fp i32:$src0)),
@@ -1121,25 +1121,42 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
11211121
(cvt_f32_f16_inst_e64 SRCMODS.NEG, $src0)
11221122
>;
11231123

1124+
// fp_to_fp16 patterns
11241125
def : GCNPat <
1125-
(f64 (any_fpextend f16:$src)),
1126-
(V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
1126+
(i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
1127+
(cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
11271128
>;
11281129

1129-
// fp_to_fp16 patterns
1130+
// This is only used on targets without half support
1131+
// TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering
11301132
def : GCNPat <
1131-
(i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
1133+
(i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
11321134
(cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
11331135
>;
1136+
}
1137+
1138+
let SubtargetPredicate = NotHasTrue16BitInsts in
1139+
defm : f16_to_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64>;
1140+
1141+
let SubtargetPredicate = UseFakeTrue16Insts in
1142+
defm : f16_to_fp_Pats<V_CVT_F16_F32_fake16_e64, V_CVT_F32_F16_fake16_e64>;
1143+
1144+
multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64,
1145+
Instruction cvt_f32_f16_inst_e64,
1146+
RegOrImmOperand VSrc> {
1147+
def : GCNPat <
1148+
(f64 (any_fpextend f16:$src)),
1149+
(V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
1150+
>;
11341151

11351152
def : GCNPat <
11361153
(i32 (fp_to_sint f16:$src)),
1137-
(V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
1154+
(V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc:$src))
11381155
>;
11391156

11401157
def : GCNPat <
11411158
(i32 (fp_to_uint f16:$src)),
1142-
(V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
1159+
(V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc:$src))
11431160
>;
11441161

11451162
def : GCNPat <
@@ -1151,20 +1168,16 @@ multiclass f16_fp_Pats<Instruction cvt_f16_f32_inst_e64, Instruction cvt_f32_f16
11511168
(f16 (uint_to_fp i32:$src)),
11521169
(cvt_f16_f32_inst_e64 SRCMODS.NONE, (V_CVT_F32_U32_e32 VSrc_b32:$src))
11531170
>;
1154-
1155-
// This is only used on targets without half support
1156-
// TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering
1157-
def : GCNPat <
1158-
(i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
1159-
(cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
1160-
>;
11611171
}
11621172

1163-
let True16Predicate = NotHasTrue16BitInsts in
1164-
defm : f16_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64>;
1173+
let SubtargetPredicate = NotHasTrue16BitInsts in
1174+
defm : f16_fp_Pats<V_CVT_F16_F32_e64, V_CVT_F32_F16_e64, VSrc_b32>;
1175+
1176+
let SubtargetPredicate = UseRealTrue16Insts in
1177+
defm : f16_fp_Pats<V_CVT_F16_F32_t16_e64, V_CVT_F32_F16_t16_e64, VSrcT_b16>;
11651178

1166-
let True16Predicate = UseFakeTrue16Insts in
1167-
defm : f16_fp_Pats<V_CVT_F16_F32_fake16_e64, V_CVT_F32_F16_fake16_e64>;
1179+
let SubtargetPredicate = UseFakeTrue16Insts in
1180+
defm : f16_fp_Pats<V_CVT_F16_F32_fake16_e64, V_CVT_F32_F16_fake16_e64, VSrc_b16>;
11681181

11691182
//===----------------------------------------------------------------------===//
11701183
// VOP2 Patterns
@@ -2774,13 +2787,24 @@ def : GCNPat <
27742787
SSrc_i1:$src))
27752788
>;
27762789

2777-
let SubtargetPredicate = HasTrue16BitInsts in
2790+
let SubtargetPredicate = UseRealTrue16Insts in
27782791
def : GCNPat <
27792792
(f16 (sint_to_fp i1:$src)),
2780-
(V_CVT_F16_F32_fake16_e32 (
2781-
V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2793+
(V_CVT_F16_F32_t16_e64 /*src0_modifiers*/ 0,
2794+
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
27822795
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
2783-
SSrc_i1:$src))
2796+
SSrc_i1:$src),
2797+
/*clamp*/ 0, /*omod*/ 0, /*op_sel*/ 0)
2798+
>;
2799+
2800+
let SubtargetPredicate = UseFakeTrue16Insts in
2801+
def : GCNPat <
2802+
(f16 (sint_to_fp i1:$src)),
2803+
(V_CVT_F16_F32_fake16_e64 /*src0_modifiers*/ 0,
2804+
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2805+
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
2806+
SSrc_i1:$src),
2807+
/*clamp*/ 0, /*omod*/ 0)
27842808
>;
27852809

27862810
let SubtargetPredicate = NotHasTrue16BitInsts in
@@ -2791,13 +2815,25 @@ def : GCNPat <
27912815
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
27922816
SSrc_i1:$src))
27932817
>;
2794-
let SubtargetPredicate = HasTrue16BitInsts in
2818+
2819+
let SubtargetPredicate = UseRealTrue16Insts in
27952820
def : GCNPat <
27962821
(f16 (uint_to_fp i1:$src)),
2797-
(V_CVT_F16_F32_fake16_e32 (
2798-
V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2822+
(V_CVT_F16_F32_t16_e64 /*src0_modifiers*/ 0,
2823+
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
27992824
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
2800-
SSrc_i1:$src))
2825+
SSrc_i1:$src),
2826+
/*clamp*/ 0, /*omod*/ 0, /*op_sel*/ 0)
2827+
>;
2828+
2829+
let SubtargetPredicate = UseFakeTrue16Insts in
2830+
def : GCNPat <
2831+
(f16 (uint_to_fp i1:$src)),
2832+
(V_CVT_F16_F32_fake16_e64 /*src0_modifiers*/ 0,
2833+
(V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
2834+
/*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
2835+
SSrc_i1:$src),
2836+
/*clamp*/ 0, /*omod*/ 0)
28012837
>;
28022838

28032839
def : GCNPat <

llvm/lib/Target/AMDGPU/VOP1Instructions.td

Lines changed: 11 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -513,7 +513,7 @@ def : GCNPat<
513513
(V_CVT_F16_F32_e32 $src)
514514
>;
515515
}
516-
let OtherPredicates = [HasTrue16BitInsts] in {
516+
let OtherPredicates = [UseRealTrue16Insts] in {
517517
def : GCNPat<
518518
(f32 (f16_to_fp i16:$src)),
519519
(V_CVT_F32_F16_t16_e32 $src)
@@ -523,6 +523,16 @@ def : GCNPat<
523523
(V_CVT_F16_F32_t16_e32 $src)
524524
>;
525525
}
526+
let OtherPredicates = [UseFakeTrue16Insts] in {
527+
def : GCNPat<
528+
(f32 (f16_to_fp i16:$src)),
529+
(V_CVT_F32_F16_fake16_e32 $src)
530+
>;
531+
def : GCNPat<
532+
(i16 (AMDGPUfp_to_f16 f32:$src)),
533+
(V_CVT_F16_F32_fake16_e32 $src)
534+
>;
535+
}
526536

527537
def VOP_SWAP_I32 : VOPProfile<[i32, i32, untyped, untyped]> {
528538
let Outs32 = (outs VGPR_32:$vdst, VRegSrc_32:$vdst1);
@@ -1421,7 +1431,6 @@ def : GCNPat<
14211431
>;
14221432
} // End OtherPredicates = [UseFakeTrue16Insts]
14231433

1424-
14251434
let OtherPredicates = [UseRealTrue16Insts] in {
14261435
def : GCNPat<
14271436
(i32 (UniformUnaryFrag<anyext> (i16 SReg_32:$src))),
@@ -1447,9 +1456,7 @@ def : GCNPat <
14471456
(i16 (trunc i64:$src)),
14481457
(EXTRACT_SUBREG $src, lo16)
14491458
>;
1450-
14511459
} // End OtherPredicates = [UseRealTrue16Insts]
1452-
14531460
//===----------------------------------------------------------------------===//
14541461
// GFX9
14551462
//===----------------------------------------------------------------------===//

0 commit comments

Comments
 (0)