Skip to content

Commit f5fd645

Browse files
jayfoad authored and github-actions[bot] committed
Automerge: [AMDGPU] Implement codegen for GFX11+ V_CVT_PK_[IU]16_F32 (#168719)
2 parents 141cfd2 + 6ae0b9f commit f5fd645

File tree

6 files changed

+127
-242
lines changed

6 files changed

+127
-242
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 6 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -6891,6 +6891,12 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
68916891
return splitTernaryVectorOp(Op, DAG);
68926892
case ISD::FP_TO_SINT:
68936893
case ISD::FP_TO_UINT:
6894+
if (Subtarget->getGeneration() >= AMDGPUSubtarget::GFX11 &&
6895+
Op.getValueType() == MVT::i16 &&
6896+
Op.getOperand(0).getValueType() == MVT::f32) {
6897+
// Make f32->i16 legal so we can select V_CVT_PK_[IU]16_F32.
6898+
return Op;
6899+
}
68946900
return LowerFP_TO_INT(Op, DAG);
68956901
case ISD::SHL:
68966902
case ISD::SRA:

llvm/lib/Target/AMDGPU/SOPInstructions.td

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -469,6 +469,14 @@ let SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE],
469469
} // End SubtargetPredicate = HasSALUFloatInsts, Uses = [MODE]
470470
// SchedRW = [WriteSFPU], isReMaterializable = 1
471471

472+
let SubtargetPredicate = HasSALUFloatInsts, AddedComplexity = 9 in {
473+
// Fallback patterns for f32->i16 conversion.
474+
def : GCNPat<(i16 (UniformUnaryFrag<fp_to_sint> f32:$src0)),
475+
(S_CVT_I32_F32 $src0)>;
476+
def : GCNPat<(i16 (UniformUnaryFrag<fp_to_uint> f32:$src0)),
477+
(S_CVT_U32_F32 $src0)>;
478+
}
479+
472480
let hasSideEffects = 1 in {
473481
let has_sdst = 0 in {
474482
let Uses = [M0] in {

llvm/lib/Target/AMDGPU/VOP3Instructions.td

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1721,6 +1721,28 @@ let SubtargetPredicate = isGFX11Plus in {
17211721
defm V_MINMAX_I32 : VOP3Inst<"v_minmax_i32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
17221722
defm V_CVT_PK_I16_F32 : VOP3Inst<"v_cvt_pk_i16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
17231723
defm V_CVT_PK_U16_F32 : VOP3Inst<"v_cvt_pk_u16_f32", VOP3_Profile<VOP_V2I16_F32_F32>>;
1724+
1725+
def : GCNPat<(v2i16 (build_vector (i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
1726+
(i16 (fp_to_sint (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)))))),
1727+
(V_CVT_PK_I16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1)>;
1728+
def : GCNPat<(v2i16 (build_vector (i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
1729+
(i16 (fp_to_uint (f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)))))),
1730+
(V_CVT_PK_U16_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1)>;
1731+
1732+
// Fallback patterns for f32->i16 conversion. These are only required because
1733+
// f32->i16 has to be legal so that we can select V_CVT_PK_[IU]16_F32 above.
1734+
let True16Predicate = UseRealTrue16Insts in {
1735+
def : GCNPat<(i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
1736+
(EXTRACT_SUBREG (V_CVT_I32_F32_e64 $src0_modifiers, $src0), lo16)>;
1737+
def : GCNPat<(i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
1738+
(EXTRACT_SUBREG (V_CVT_U32_F32_e64 $src0_modifiers, $src0), lo16)>;
1739+
}
1740+
let True16Predicate = NotUseRealTrue16Insts in {
1741+
def : GCNPat<(i16 (fp_to_sint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
1742+
(V_CVT_I32_F32_e64 $src0_modifiers, $src0)>;
1743+
def : GCNPat<(i16 (fp_to_uint (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
1744+
(V_CVT_U32_F32_e64 $src0_modifiers, $src0)>;
1745+
}
17241746
} // End SubtargetPredicate = isGFX11Plus
17251747

17261748
class VOP3_CVT_SR_FP16_TiedInput_Profile<VOPProfile P> : VOP3_CVT_SCALE_F1632_FP8BF8_TiedInput_Profile<P> {

llvm/test/CodeGen/AMDGPU/bf16.ll

Lines changed: 59 additions & 172 deletions
Original file line number | Diff line number | Diff line change
@@ -35518,55 +35518,24 @@ define <2 x i16> @v_fptosi_v2bf16_to_v2i16(<2 x bfloat> %x) {
3551835518
; GFX10-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
3551935519
; GFX10-NEXT: s_setpc_b64 s[30:31]
3552035520
;
35521-
; GFX11TRUE16-LABEL: v_fptosi_v2bf16_to_v2i16:
35522-
; GFX11TRUE16: ; %bb.0:
35523-
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35524-
; GFX11TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
35525-
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
35526-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
35527-
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
35528-
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
35529-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
35530-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
35531-
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
35532-
;
35533-
; GFX11FAKE16-LABEL: v_fptosi_v2bf16_to_v2i16:
35534-
; GFX11FAKE16: ; %bb.0:
35535-
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35536-
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
35537-
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
35538-
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
35539-
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
35540-
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
35541-
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
35542-
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
35543-
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
35544-
;
35545-
; GFX1250TRUE16-LABEL: v_fptosi_v2bf16_to_v2i16:
35546-
; GFX1250TRUE16: ; %bb.0:
35547-
; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
35548-
; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0
35549-
; GFX1250TRUE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
35550-
; GFX1250TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
35551-
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
35552-
; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
35553-
; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
35554-
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
35555-
; GFX1250TRUE16-NEXT: v_mov_b16_e32 v0.h, v1.l
35556-
; GFX1250TRUE16-NEXT: s_set_pc_i64 s[30:31]
35521+
; GFX11-LABEL: v_fptosi_v2bf16_to_v2i16:
35522+
; GFX11: ; %bb.0:
35523+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35524+
; GFX11-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
35525+
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
35526+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
35527+
; GFX11-NEXT: v_cvt_pk_i16_f32 v0, v0, v1
35528+
; GFX11-NEXT: s_setpc_b64 s[30:31]
3555735529
;
35558-
; GFX1250FAKE16-LABEL: v_fptosi_v2bf16_to_v2i16:
35559-
; GFX1250FAKE16: ; %bb.0:
35560-
; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
35561-
; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0
35562-
; GFX1250FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v0
35563-
; GFX1250FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
35564-
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
35565-
; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
35566-
; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
35567-
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
35568-
; GFX1250FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x5040100
35569-
; GFX1250FAKE16-NEXT: s_set_pc_i64 s[30:31]
35530+
; GFX1250-LABEL: v_fptosi_v2bf16_to_v2i16:
35531+
; GFX1250: ; %bb.0:
35532+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
35533+
; GFX1250-NEXT: s_wait_kmcnt 0x0
35534+
; GFX1250-NEXT: v_and_b32_e32 v1, 0xffff0000, v0
35535+
; GFX1250-NEXT: v_lshlrev_b32_e32 v0, 16, v0
35536+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1)
35537+
; GFX1250-NEXT: v_cvt_pk_i16_f32 v0, v0, v1
35538+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
3557035539
%op = fptosi <2 x bfloat> %x to <2 x i16>
3557135540
ret <2 x i16> %op
3557235541
}
@@ -35660,61 +35629,27 @@ define <3 x i16> @v_fptosi_v3bf16_to_v3i16(<3 x bfloat> %x) {
3566035629
; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
3566135630
; GFX10-NEXT: s_setpc_b64 s[30:31]
3566235631
;
35663-
; GFX11TRUE16-LABEL: v_fptosi_v3bf16_to_v3i16:
35664-
; GFX11TRUE16: ; %bb.0:
35665-
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35666-
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
35667-
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
35668-
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
35669-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
35670-
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
35671-
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
35672-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
35673-
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
35674-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
35675-
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
35676-
;
35677-
; GFX11FAKE16-LABEL: v_fptosi_v3bf16_to_v3i16:
35678-
; GFX11FAKE16: ; %bb.0:
35679-
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35680-
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v0
35681-
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
35682-
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
35683-
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
35684-
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
35685-
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
35686-
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
35687-
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
35688-
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
35689-
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
35690-
;
35691-
; GFX1250TRUE16-LABEL: v_fptosi_v3bf16_to_v3i16:
35692-
; GFX1250TRUE16: ; %bb.0:
35693-
; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
35694-
; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0
35695-
; GFX1250TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
35696-
; GFX1250TRUE16-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
35697-
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
35698-
; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
35699-
; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
35700-
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
35701-
; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
35702-
; GFX1250TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
35703-
; GFX1250TRUE16-NEXT: s_set_pc_i64 s[30:31]
35632+
; GFX11-LABEL: v_fptosi_v3bf16_to_v3i16:
35633+
; GFX11: ; %bb.0:
35634+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35635+
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
35636+
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
35637+
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
35638+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
35639+
; GFX11-NEXT: v_cvt_pk_i16_f32 v0, v0, v2
35640+
; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1
35641+
; GFX11-NEXT: s_setpc_b64 s[30:31]
3570435642
;
35705-
; GFX1250FAKE16-LABEL: v_fptosi_v3bf16_to_v3i16:
35706-
; GFX1250FAKE16: ; %bb.0:
35707-
; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
35708-
; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0
35709-
; GFX1250FAKE16-NEXT: v_dual_lshlrev_b32 v2, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
35710-
; GFX1250FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
35711-
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3)
35712-
; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
35713-
; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
35714-
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1)
35715-
; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
35716-
; GFX1250FAKE16-NEXT: v_perm_b32 v0, v0, v2, 0x5040100
35717-
; GFX1250FAKE16-NEXT: s_set_pc_i64 s[30:31]
35643+
; GFX1250-LABEL: v_fptosi_v3bf16_to_v3i16:
35644+
; GFX1250: ; %bb.0:
35645+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
35646+
; GFX1250-NEXT: s_wait_kmcnt 0x0
35647+
; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
35648+
; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
35649+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
35650+
; GFX1250-NEXT: v_cvt_pk_i16_f32 v0, v0, v2
35651+
; GFX1250-NEXT: v_cvt_i32_f32_e32 v1, v1
35652+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
3571835653
%op = fptosi <3 x bfloat> %x to <3 x i16>
3571935654
ret <3 x i16> %op
3572035655
}
@@ -35832,77 +35767,29 @@ define <4 x i16> @v_fptosi_v4bf16_to_v4i16(<4 x bfloat> %x) {
3583235767
; GFX10-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
3583335768
; GFX10-NEXT: s_setpc_b64 s[30:31]
3583435769
;
35835-
; GFX11TRUE16-LABEL: v_fptosi_v4bf16_to_v4i16:
35836-
; GFX11TRUE16: ; %bb.0:
35837-
; GFX11TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35838-
; GFX11TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
35839-
; GFX11TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
35840-
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v1, 16, v1
35841-
; GFX11TRUE16-NEXT: v_lshlrev_b32_e32 v0, 16, v0
35842-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
35843-
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
35844-
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v3, v3
35845-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
35846-
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
35847-
; GFX11TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
35848-
; GFX11TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
35849-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
35850-
; GFX11TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
35851-
; GFX11TRUE16-NEXT: s_setpc_b64 s[30:31]
35852-
;
35853-
; GFX11FAKE16-LABEL: v_fptosi_v4bf16_to_v4i16:
35854-
; GFX11FAKE16: ; %bb.0:
35855-
; GFX11FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35856-
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v2, 16, v1
35857-
; GFX11FAKE16-NEXT: v_lshlrev_b32_e32 v3, 16, v0
35858-
; GFX11FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
35859-
; GFX11FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35860-
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
35861-
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
35862-
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v3, v3
35863-
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
35864-
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
35865-
; GFX11FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
35866-
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
35867-
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
35868-
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
35869-
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
35870-
;
35871-
; GFX1250TRUE16-LABEL: v_fptosi_v4bf16_to_v4i16:
35872-
; GFX1250TRUE16: ; %bb.0:
35873-
; GFX1250TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0
35874-
; GFX1250TRUE16-NEXT: s_wait_kmcnt 0x0
35875-
; GFX1250TRUE16-NEXT: v_and_b32_e32 v2, 0xffff0000, v0
35876-
; GFX1250TRUE16-NEXT: v_and_b32_e32 v3, 0xffff0000, v1
35877-
; GFX1250TRUE16-NEXT: v_dual_lshlrev_b32 v1, 16, v1 :: v_dual_lshlrev_b32 v0, 16, v0
35878-
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
35879-
; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v2
35880-
; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v3, v3
35881-
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
35882-
; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1
35883-
; GFX1250TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0
35884-
; GFX1250TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
35885-
; GFX1250TRUE16-NEXT: v_mov_b16_e32 v0.h, v2.l
35886-
; GFX1250TRUE16-NEXT: v_mov_b16_e32 v1.h, v3.l
35887-
; GFX1250TRUE16-NEXT: s_set_pc_i64 s[30:31]
35770+
; GFX11-LABEL: v_fptosi_v4bf16_to_v4i16:
35771+
; GFX11: ; %bb.0:
35772+
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
35773+
; GFX11-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
35774+
; GFX11-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
35775+
; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0
35776+
; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1
35777+
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
35778+
; GFX11-NEXT: v_cvt_pk_i16_f32 v0, v0, v3
35779+
; GFX11-NEXT: v_cvt_pk_i16_f32 v1, v1, v2
35780+
; GFX11-NEXT: s_setpc_b64 s[30:31]
3588835781
;
35889-
; GFX1250FAKE16-LABEL: v_fptosi_v4bf16_to_v4i16:
35890-
; GFX1250FAKE16: ; %bb.0:
35891-
; GFX1250FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0
35892-
; GFX1250FAKE16-NEXT: s_wait_kmcnt 0x0
35893-
; GFX1250FAKE16-NEXT: v_dual_lshlrev_b32 v2, 16, v1 :: v_dual_lshlrev_b32 v3, 16, v0
35894-
; GFX1250FAKE16-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
35895-
; GFX1250FAKE16-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
35896-
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_4)
35897-
; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v2
35898-
; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v3, v3
35899-
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
35900-
; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0
35901-
; GFX1250FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1
35902-
; GFX1250FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
35903-
; GFX1250FAKE16-NEXT: v_perm_b32 v0, v0, v3, 0x5040100
35904-
; GFX1250FAKE16-NEXT: v_perm_b32 v1, v1, v2, 0x5040100
35905-
; GFX1250FAKE16-NEXT: s_set_pc_i64 s[30:31]
35782+
; GFX1250-LABEL: v_fptosi_v4bf16_to_v4i16:
35783+
; GFX1250: ; %bb.0:
35784+
; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
35785+
; GFX1250-NEXT: s_wait_kmcnt 0x0
35786+
; GFX1250-NEXT: v_and_b32_e32 v2, 0xffff0000, v1
35787+
; GFX1250-NEXT: v_and_b32_e32 v3, 0xffff0000, v0
35788+
; GFX1250-NEXT: v_dual_lshlrev_b32 v0, 16, v0 :: v_dual_lshlrev_b32 v1, 16, v1
35789+
; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2)
35790+
; GFX1250-NEXT: v_cvt_pk_i16_f32 v0, v0, v3
35791+
; GFX1250-NEXT: v_cvt_pk_i16_f32 v1, v1, v2
35792+
; GFX1250-NEXT: s_set_pc_i64 s[30:31]
3590635793
%op = fptosi <4 x bfloat> %x to <4 x i16>
3590735794
ret <4 x i16> %op
3590835795
}

llvm/test/CodeGen/AMDGPU/fp_to_sint.ll

Lines changed: 16 additions & 35 deletions
Original file line number | Diff line number | Diff line change
@@ -1241,27 +1241,16 @@ define amdgpu_kernel void @fp_to_sint_f32_i16(ptr addrspace(1) %out, float %in)
12411241
; VI-NEXT: buffer_store_short v0, off, s[0:3], 0
12421242
; VI-NEXT: s_endpgm
12431243
;
1244-
; GFX11-SDAG-LABEL: fp_to_sint_f32_i16:
1245-
; GFX11-SDAG: ; %bb.0:
1246-
; GFX11-SDAG-NEXT: s_clause 0x1
1247-
; GFX11-SDAG-NEXT: s_load_b32 s2, s[4:5], 0x2c
1248-
; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
1249-
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
1250-
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
1251-
; GFX11-SDAG-NEXT: v_cvt_i32_f32_e32 v1, s2
1252-
; GFX11-SDAG-NEXT: global_store_b16 v0, v1, s[0:1]
1253-
; GFX11-SDAG-NEXT: s_endpgm
1254-
;
1255-
; GFX11-GISEL-LABEL: fp_to_sint_f32_i16:
1256-
; GFX11-GISEL: ; %bb.0:
1257-
; GFX11-GISEL-NEXT: s_clause 0x1
1258-
; GFX11-GISEL-NEXT: s_load_b32 s2, s[4:5], 0x2c
1259-
; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
1260-
; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0
1261-
; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
1262-
; GFX11-GISEL-NEXT: v_cvt_i32_f32_e32 v0, s2
1263-
; GFX11-GISEL-NEXT: global_store_b16 v1, v0, s[0:1]
1264-
; GFX11-GISEL-NEXT: s_endpgm
1244+
; GFX11-LABEL: fp_to_sint_f32_i16:
1245+
; GFX11: ; %bb.0:
1246+
; GFX11-NEXT: s_clause 0x1
1247+
; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c
1248+
; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24
1249+
; GFX11-NEXT: v_mov_b32_e32 v1, 0
1250+
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
1251+
; GFX11-NEXT: v_cvt_i32_f32_e32 v0, s2
1252+
; GFX11-NEXT: global_store_b16 v1, v0, s[0:1]
1253+
; GFX11-NEXT: s_endpgm
12651254
;
12661255
; EG-LABEL: fp_to_sint_f32_i16:
12671256
; EG: ; %bb.0:
@@ -1321,13 +1310,10 @@ define amdgpu_kernel void @fp_to_sint_v2f32_to_v2i16(ptr addrspace(1) %out, <2 x
13211310
; GFX11-SDAG-LABEL: fp_to_sint_v2f32_to_v2i16:
13221311
; GFX11-SDAG: ; %bb.0:
13231312
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1324-
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0
1313+
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
13251314
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
1326-
; GFX11-SDAG-NEXT: v_cvt_i32_f32_e32 v0, s3
1327-
; GFX11-SDAG-NEXT: v_cvt_i32_f32_e32 v2, s2
1328-
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
1329-
; GFX11-SDAG-NEXT: v_mov_b16_e32 v2.h, v0.l
1330-
; GFX11-SDAG-NEXT: global_store_b32 v1, v2, s[0:1]
1315+
; GFX11-SDAG-NEXT: v_cvt_pk_i16_f32 v1, s2, s3
1316+
; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
13311317
; GFX11-SDAG-NEXT: s_endpgm
13321318
;
13331319
; GFX11-GISEL-LABEL: fp_to_sint_v2f32_to_v2i16:
@@ -1396,13 +1382,10 @@ define amdgpu_kernel void @fp_to_sint_f32_to_v2i16(ptr addrspace(1) %out, float
13961382
; GFX11-SDAG-LABEL: fp_to_sint_f32_to_v2i16:
13971383
; GFX11-SDAG: ; %bb.0:
13981384
; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
1399-
; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0
1385+
; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0
14001386
; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
1401-
; GFX11-SDAG-NEXT: v_cvt_i32_f32_e32 v0, s3
1402-
; GFX11-SDAG-NEXT: v_cvt_i32_f32_e32 v2, s2
1403-
; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2)
1404-
; GFX11-SDAG-NEXT: v_mov_b16_e32 v2.h, v0.l
1405-
; GFX11-SDAG-NEXT: global_store_b32 v1, v2, s[0:1]
1387+
; GFX11-SDAG-NEXT: v_cvt_pk_i16_f32 v1, s2, s3
1388+
; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1]
14061389
; GFX11-SDAG-NEXT: s_endpgm
14071390
;
14081391
; GFX11-GISEL-LABEL: fp_to_sint_f32_to_v2i16:
@@ -1444,5 +1427,3 @@ define amdgpu_kernel void @fp_to_sint_f32_to_v2i16(ptr addrspace(1) %out, float
14441427

14451428
attributes #0 = { nounwind }
14461429
attributes #1 = { nounwind readnone }
1447-
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
1448-
; GFX11: {{.*}}

0 commit comments

Comments
 (0)