diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 2dca8c0da4756..70b229294b920 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4582,6 +4582,10 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
   bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1;
 
   if (VT.isScalableVector()) {
+    // Let common code split the operation.
+    if (SrcVT == MVT::nxv8f32)
+      return Op;
+
     if (VT.getScalarType() != MVT::bf16)
       return LowerToPredicatedOp(Op, DAG,
                                  AArch64ISD::FP_ROUND_MERGE_PASSTHRU);
@@ -4724,6 +4728,22 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
   assert(!(IsStrict && VT.isScalableVector()) &&
          "Unimplemented SVE support for STRICT_FP_to_INT!");
 
+  // f16 conversions are promoted to f32 when full fp16 is not supported.
+  if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
+      InVT.getVectorElementType() == MVT::bf16) {
+    EVT NewVT = VT.changeElementType(MVT::f32);
+    SDLoc dl(Op);
+    if (IsStrict) {
+      SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
+                                {Op.getOperand(0), Op.getOperand(1)});
+      return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
+                         {Ext.getValue(1), Ext.getValue(0)});
+    }
+    return DAG.getNode(
+        Op.getOpcode(), dl, Op.getValueType(),
+        DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
+  }
+
   if (VT.isScalableVector()) {
     if (VT.getVectorElementType() == MVT::i1) {
       SDLoc DL(Op);
@@ -4733,6 +4753,10 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
       return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
     }
 
+    // Let common code split the operation.
+    if (InVT == MVT::nxv8f32)
+      return Op;
+
     unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
                           ? AArch64ISD::FCVTZU_MERGE_PASSTHRU
                           : AArch64ISD::FCVTZS_MERGE_PASSTHRU;
@@ -4743,24 +4767,6 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
       useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
     return LowerFixedLengthFPToIntToSVE(Op, DAG);
 
-  unsigned NumElts = InVT.getVectorNumElements();
-
-  // f16 conversions are promoted to f32 when full fp16 is not supported.
-  if ((InVT.getVectorElementType() == MVT::f16 && !Subtarget->hasFullFP16()) ||
-      InVT.getVectorElementType() == MVT::bf16) {
-    MVT NewVT = MVT::getVectorVT(MVT::f32, NumElts);
-    SDLoc dl(Op);
-    if (IsStrict) {
-      SDValue Ext = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NewVT, MVT::Other},
-                                {Op.getOperand(0), Op.getOperand(1)});
-      return DAG.getNode(Op.getOpcode(), dl, {VT, MVT::Other},
-                         {Ext.getValue(1), Ext.getValue(0)});
-    }
-    return DAG.getNode(
-        Op.getOpcode(), dl, Op.getValueType(),
-        DAG.getNode(ISD::FP_EXTEND, dl, NewVT, Op.getOperand(0)));
-  }
-
   uint64_t VTSize = VT.getFixedSizeInBits();
   uint64_t InVTSize = InVT.getFixedSizeInBits();
   if (VTSize < InVTSize) {
@@ -4795,7 +4801,7 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
 
   // Use a scalar operation for conversions between single-element vectors of
   // the same size.
-  if (NumElts == 1) {
+  if (InVT.getVectorNumElements() == 1) {
     SDLoc dl(Op);
     SDValue Extract = DAG.getNode(
         ISD::EXTRACT_VECTOR_ELT, dl, InVT.getScalarType(),
@@ -5041,23 +5047,14 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
   assert(!(IsStrict && VT.isScalableVector()) &&
          "Unimplemented SVE support for ISD:::STRICT_INT_TO_FP!");
 
-  if (VT.isScalableVector()) {
-    if (InVT.getVectorElementType() == MVT::i1) {
-      SDValue FalseVal = DAG.getConstantFP(0.0, dl, VT);
-      SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, dl, VT)
-                                 : DAG.getConstantFP(1.0, dl, VT);
-      return DAG.getNode(ISD::VSELECT, dl, VT, In, TrueVal, FalseVal);
-    }
-
-    unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
-                               : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
-    return LowerToPredicatedOp(Op, DAG, Opcode);
+  // NOTE: i1->bf16 does not require promotion to f32.
+  if (VT.isScalableVector() && InVT.getVectorElementType() == MVT::i1) {
+    SDValue FalseVal = DAG.getConstantFP(0.0, dl, VT);
+    SDValue TrueVal = IsSigned ? DAG.getConstantFP(-1.0, dl, VT)
+                               : DAG.getConstantFP(1.0, dl, VT);
+    return DAG.getNode(ISD::VSELECT, dl, VT, In, TrueVal, FalseVal);
   }
 
-  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
-      useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
-    return LowerFixedLengthIntToFPToSVE(Op, DAG);
-
   // Promote bf16 conversions to f32.
   if (VT.getVectorElementType() == MVT::bf16) {
     EVT F32 = VT.changeElementType(MVT::f32);
@@ -5074,6 +5071,20 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
         DAG.getIntPtrConstant(0, dl, /*isTarget=*/true));
   }
 
+  if (VT.isScalableVector()) {
+    // Let common code split the operation.
+    if (VT == MVT::nxv8f32)
+      return Op;
+
+    unsigned Opcode = IsSigned ? AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU
+                               : AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU;
+    return LowerToPredicatedOp(Op, DAG, Opcode);
+  }
+
+  if (useSVEForFixedLengthVectorVT(VT, !Subtarget->isNeonAvailable()) ||
+      useSVEForFixedLengthVectorVT(InVT, !Subtarget->isNeonAvailable()))
+    return LowerFixedLengthIntToFPToSVE(Op, DAG);
+
   uint64_t VTSize = VT.getFixedSizeInBits();
   uint64_t InVTSize = InVT.getFixedSizeInBits();
   if (VTSize < InVTSize) {
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 8d2e7f4a8ed10..eafaf1717902e 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -5465,6 +5465,14 @@ multiclass sve_int_dup_fpimm_pred<string asm> {
             (!cast<Instruction>(NAME # _S) $zd, $pg, fpimm32:$imm8)>;
   def : Pat<(nxv2f64 (vselect nxv2i1:$pg, (splat_vector fpimm64:$imm8), nxv2f64:$zd)),
             (!cast<Instruction>(NAME # _D) $zd, $pg, fpimm64:$imm8)>;
+
+  // Some half precision immediates alias with bfloat (e.g. f16(1.875) == bf16(1.0)).
+  def : Pat<(nxv8bf16 (vselect nxv8i1:$pg, (splat_vector fpimmbf16:$imm8), nxv8bf16:$zd)),
+            (!cast<Instruction>(NAME # _H) $zd, $pg, (fpimm16XForm bf16:$imm8))>;
+  def : Pat<(nxv4bf16 (vselect nxv4i1:$pg, (splat_vector fpimmbf16:$imm8), nxv4bf16:$zd)),
+            (!cast<Instruction>(NAME # _H) $zd, $pg, (fpimm16XForm bf16:$imm8))>;
+  def : Pat<(nxv2bf16 (vselect nxv2i1:$pg, (splat_vector fpimmbf16:$imm8), nxv2bf16:$zd)),
+            (!cast<Instruction>(NAME # _H) $zd, $pg, (fpimm16XForm bf16:$imm8))>;
 }
 
 class sve_int_dup_imm_pred<bits<2> sz8_64, bit m, string asm,
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-int-converts.ll b/llvm/test/CodeGen/AArch64/sve-bf16-int-converts.ll
new file mode 100644
index 0000000000000..d6484c2483f49
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-bf16-int-converts.ll
@@ -0,0 +1,816 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 2 x i1> @fptosi_nxv2bf16_to_nxv2i1(<vscale x 2 x bf16> %a) {
+; CHECK-LABEL: fptosi_nxv2bf16_to_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 2 x bf16> %a to <vscale x 2 x i1>
+  ret <vscale x 2 x i1> %res
+}
+
+define <vscale x 2 x i8> @fptosi_nxv2bf16_to_nxv2i8(<vscale x 2 x bf16> %a) {
+; CHECK-LABEL: fptosi_nxv2bf16_to_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 2 x bf16> %a to <vscale x 2 x i8>
+  ret <vscale x 2 x i8> %res
+}
+
+define <vscale x 2 x i16> @fptosi_nxv2bf16_to_nxv2i16(<vscale x 2 x bf16> %a) {
+; CHECK-LABEL: fptosi_nxv2bf16_to_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 2 x bf16> %a to <vscale x 2 x i16>
+  ret <vscale x 2 x i16> %res
+}
+
+define <vscale x 2 x i32> @fptosi_nxv2bf16_to_nxv2i32(<vscale x 2 x bf16> %a) {
+; CHECK-LABEL: fptosi_nxv2bf16_to_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 2 x bf16> %a to <vscale x 2 x i32>
+  ret <vscale x 2 x i32> %res
+}
+
+define <vscale x 2 x i64> @fptosi_nxv2bf16_to_nxv2i64(<vscale x 2 x bf16> %a) {
+; CHECK-LABEL: fptosi_nxv2bf16_to_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 2 x bf16> %a to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 4 x i1> @fptosi_nxv4bf16_to_nxv4i1(<vscale x 4 x bf16> %a) {
+; CHECK-LABEL: fptosi_nxv4bf16_to_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 4 x bf16> %a to <vscale x 4 x i1>
+  ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 4 x i8> @fptosi_nxv4bf16_to_nxv4i8(<vscale x 4 x bf16> %a) {
+; CHECK-LABEL: fptosi_nxv4bf16_to_nxv4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 4 x bf16> %a to <vscale x 4 x i8>
+  ret <vscale x 4 x i8> %res
+}
+
+define <vscale x 4 x i16> @fptosi_nxv4bf16_to_nxv4i16(<vscale x 4 x bf16> %a) {
+; CHECK-LABEL: fptosi_nxv4bf16_to_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 4 x bf16> %a to <vscale x 4 x i16>
+  ret <vscale x 4 x i16> %res
+}
+
+define <vscale x 4 x i32> @fptosi_nxv4bf16_to_nxv4i32(<vscale x 4 x bf16> %a) {
+; CHECK-LABEL: fptosi_nxv4bf16_to_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 4 x bf16> %a to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i64> @fptosi_nxv4bf16_to_nxv4i64(<vscale x 4 x bf16> %a) {
+; CHECK-LABEL: fptosi_nxv4bf16_to_nxv4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z2.s, z0.s, #16
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z1.s
+; CHECK-NEXT:    movprfx z1, z2
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z2.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 4 x bf16> %a to <vscale x 4 x i64>
+  ret <vscale x 4 x i64> %res
+}
+
+define <vscale x 8 x i1> @fptosi_nxv8bf16_to_nxv8i1(<vscale x 8 x bf16> %a) {
+; CHECK-LABEL: fptosi_nxv8bf16_to_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 8 x bf16> %a to <vscale x 8 x i1>
+  ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 8 x i8> @fptosi_nxv8bf16_to_nxv8i8(<vscale x 8 x bf16> %a) {
+; CHECK-LABEL: fptosi_nxv8bf16_to_nxv8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 8 x bf16> %a to <vscale x 8 x i8>
+  ret <vscale x 8 x i8> %res
+}
+
+define <vscale x 8 x i16> @fptosi_nxv8bf16_to_nxv8i16(<vscale x 8 x bf16> %a) {
+; CHECK-LABEL: fptosi_nxv8bf16_to_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 8 x bf16> %a to <vscale x 8 x i16>
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x i32> @fptosi_nxv8bf16_to_nxv8i32(<vscale x 8 x bf16> %a) {
+; CHECK-LABEL: fptosi_nxv8bf16_to_nxv8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpklo z1.s, z0.h
+; CHECK-NEXT:    uunpkhi z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z2.s, z0.s, #16
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z1.s
+; CHECK-NEXT:    movprfx z1, z2
+; CHECK-NEXT:    fcvtzs z1.s, p0/m, z2.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 8 x bf16> %a to <vscale x 8 x i32>
+  ret <vscale x 8 x i32> %res
+}
+
+define <vscale x 8 x i64> @fptosi_nxv8bf16_to_nxv8i64(<vscale x 8 x bf16> %a) {
+; CHECK-LABEL: fptosi_nxv8bf16_to_nxv8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpklo z1.s, z0.h
+; CHECK-NEXT:    uunpkhi z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z2.d, z1.s
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    uunpklo z3.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z2.s, z2.s, #16
+; CHECK-NEXT:    lsl z3.s, z3.s, #16
+; CHECK-NEXT:    lsl z4.s, z0.s, #16
+; CHECK-NEXT:    fcvtzs z1.d, p0/m, z1.s
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z2.s
+; CHECK-NEXT:    movprfx z2, z3
+; CHECK-NEXT:    fcvtzs z2.d, p0/m, z3.s
+; CHECK-NEXT:    movprfx z3, z4
+; CHECK-NEXT:    fcvtzs z3.d, p0/m, z4.s
+; CHECK-NEXT:    ret
+  %res = fptosi <vscale x 8 x bf16> %a to <vscale x 8 x i64>
+  ret <vscale x 8 x i64> %res
+}
+
+define <vscale x 2 x i1> @fptoui_nxv2bf16_to_nxv2i1(<vscale x 2 x bf16> %a) {
+; CHECK-LABEL: fptoui_nxv2bf16_to_nxv2i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
+; CHECK-NEXT:    cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 2 x bf16> %a to <vscale x 2 x i1>
+  ret <vscale x 2 x i1> %res
+}
+
+; NOTE: Using fcvtzs is safe as fptoui overflow is considered poison and a
+; 64bit signed value encompasses the entire range of a 16bit unsigned value.
+define <vscale x 2 x i8> @fptoui_nxv2bf16_to_nxv2i8(<vscale x 2 x bf16> %a) {
+; CHECK-LABEL: fptoui_nxv2bf16_to_nxv2i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 2 x bf16> %a to <vscale x 2 x i8>
+  ret <vscale x 2 x i8> %res
+}
+
+define <vscale x 2 x i16> @fptoui_nxv2bf16_to_nxv2i16(<vscale x 2 x bf16> %a) {
+; CHECK-LABEL: fptoui_nxv2bf16_to_nxv2i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 2 x bf16> %a to <vscale x 2 x i16>
+  ret <vscale x 2 x i16> %res
+}
+
+define <vscale x 2 x i32> @fptoui_nxv2bf16_to_nxv2i32(<vscale x 2 x bf16> %a) {
+; CHECK-LABEL: fptoui_nxv2bf16_to_nxv2i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 2 x bf16> %a to <vscale x 2 x i32>
+  ret <vscale x 2 x i32> %res
+}
+
+define <vscale x 2 x i64> @fptoui_nxv2bf16_to_nxv2i64(<vscale x 2 x bf16> %a) {
+; CHECK-LABEL: fptoui_nxv2bf16_to_nxv2i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 2 x bf16> %a to <vscale x 2 x i64>
+  ret <vscale x 2 x i64> %res
+}
+
+define <vscale x 4 x i1> @fptoui_nxv4bf16_to_nxv4i1(<vscale x 4 x bf16> %a) {
+; CHECK-LABEL: fptoui_nxv4bf16_to_nxv4i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT:    cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 4 x bf16> %a to <vscale x 4 x i1>
+  ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 4 x i8> @fptoui_nxv4bf16_to_nxv4i8(<vscale x 4 x bf16> %a) {
+; CHECK-LABEL: fptoui_nxv4bf16_to_nxv4i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 4 x bf16> %a to <vscale x 4 x i8>
+  ret <vscale x 4 x i8> %res
+}
+
+define <vscale x 4 x i16> @fptoui_nxv4bf16_to_nxv4i16(<vscale x 4 x bf16> %a) {
+; CHECK-LABEL: fptoui_nxv4bf16_to_nxv4i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 4 x bf16> %a to <vscale x 4 x i16>
+  ret <vscale x 4 x i16> %res
+}
+
+define <vscale x 4 x i32> @fptoui_nxv4bf16_to_nxv4i32(<vscale x 4 x bf16> %a) {
+; CHECK-LABEL: fptoui_nxv4bf16_to_nxv4i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 4 x bf16> %a to <vscale x 4 x i32>
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 4 x i64> @fptoui_nxv4bf16_to_nxv4i64(<vscale x 4 x bf16> %a) {
+; CHECK-LABEL: fptoui_nxv4bf16_to_nxv4i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpklo z1.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z2.s, z0.s, #16
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z1.s
+; CHECK-NEXT:    movprfx z1, z2
+; CHECK-NEXT:    fcvtzu z1.d, p0/m, z2.s
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 4 x bf16> %a to <vscale x 4 x i64>
+  ret <vscale x 4 x i64> %res
+}
+
+define <vscale x 8 x i1> @fptoui_nxv8bf16_to_nxv8i1(<vscale x 8 x bf16> %a) {
+; CHECK-LABEL: fptoui_nxv8bf16_to_nxv8i1:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 8 x bf16> %a to <vscale x 8 x i1>
+  ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 8 x i8> @fptoui_nxv8bf16_to_nxv8i8(<vscale x 8 x bf16> %a) {
+; CHECK-LABEL: fptoui_nxv8bf16_to_nxv8i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 8 x bf16> %a to <vscale x 8 x i8>
+  ret <vscale x 8 x i8> %res
+}
+
+define <vscale x 8 x i16> @fptoui_nxv8bf16_to_nxv8i16(<vscale x 8 x bf16> %a) {
+; CHECK-LABEL: fptoui_nxv8bf16_to_nxv8i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z0.s, z0.s, #16
+; CHECK-NEXT:    fcvtzs z1.s, p0/m, z1.s
+; CHECK-NEXT:    fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 8 x bf16> %a to <vscale x 8 x i16>
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 8 x i32> @fptoui_nxv8bf16_to_nxv8i32(<vscale x 8 x bf16> %a) {
+; CHECK-LABEL: fptoui_nxv8bf16_to_nxv8i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpklo z1.s, z0.h
+; CHECK-NEXT:    uunpkhi z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z2.s, z0.s, #16
+; CHECK-NEXT:    movprfx z0, z1
+; CHECK-NEXT:    fcvtzu z0.s, p0/m, z1.s
+; CHECK-NEXT:    movprfx z1, z2
+; CHECK-NEXT:    fcvtzu z1.s, p0/m, z2.s
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 8 x bf16> %a to <vscale x 8 x i32>
+  ret <vscale x 8 x i32> %res
+}
+
+define <vscale x 8 x i64> @fptoui_nxv8bf16_to_nxv8i64(<vscale x 8 x bf16> %a) {
+; CHECK-LABEL: fptoui_nxv8bf16_to_nxv8i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpklo z1.s, z0.h
+; CHECK-NEXT:    uunpkhi z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    uunpklo z2.d, z1.s
+; CHECK-NEXT:    uunpkhi z1.d, z1.s
+; CHECK-NEXT:    uunpklo z3.d, z0.s
+; CHECK-NEXT:    uunpkhi z0.d, z0.s
+; CHECK-NEXT:    lsl z1.s, z1.s, #16
+; CHECK-NEXT:    lsl z2.s, z2.s, #16
+; CHECK-NEXT:    lsl z3.s, z3.s, #16
+; CHECK-NEXT:    lsl z4.s, z0.s, #16
+; CHECK-NEXT:    fcvtzu z1.d, p0/m, z1.s
+; CHECK-NEXT:    movprfx z0, z2
+; CHECK-NEXT:    fcvtzu z0.d, p0/m, z2.s
+; CHECK-NEXT:    movprfx z2, z3
+; CHECK-NEXT:    fcvtzu z2.d, p0/m, z3.s
+; CHECK-NEXT:    movprfx z3, z4
+; CHECK-NEXT:    fcvtzu z3.d, p0/m, z4.s
+; CHECK-NEXT:    ret
+  %res = fptoui <vscale x 8 x bf16> %a to <vscale x 8 x i64>
+  ret <vscale x 8 x i64> %res
+}
+
+; NOTE: f16(-1.875) == bf16(-1.0)
+define <vscale x 2 x bf16> @sitofp_nxv2i1_to_nxv2bf16(<vscale x 2 x i1> %a) {
+; CHECK-LABEL: sitofp_nxv2i1_to_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    fmov z0.h, p0/m, #-1.87500000
+; CHECK-NEXT:    ret
+  %res = sitofp <vscale x 2 x i1> %a to <vscale x 2 x bf16>
+  ret <vscale x 2 x bf16> %res
+}
+
+define <vscale x 2 x bf16> @sitofp_nxv2i8_to_nxv2bf16(<vscale x 2 x i8> %a) {
+; CHECK-LABEL: sitofp_nxv2i8_to_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxtb z0.d, p0/m, z0.d
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = sitofp <vscale x 2 x i8> %a to <vscale x 2 x bf16>
+  ret <vscale x 2 x bf16> %res
+}
+
+define <vscale x 2 x bf16> @sitofp_nxv2i16_to_nxv2bf16(<vscale x 2 x i16> %a) {
+; CHECK-LABEL: sitofp_nxv2i16_to_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    sxth z0.d, p0/m, z0.d
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = sitofp <vscale x 2 x i16> %a to <vscale x 2 x bf16>
+  ret <vscale x 2 x bf16> %res
+}
+
+define <vscale x 2 x bf16> @sitofp_nxv2i32_to_nxv2bf16(<vscale x 2 x i32> %a) {
+; CHECK-LABEL: sitofp_nxv2i32_to_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = sitofp <vscale x 2 x i32> %a to <vscale x 2 x bf16>
+  ret <vscale x 2 x bf16> %res
+}
+
+define <vscale x 2 x bf16> @sitofp_nxv2i64_to_nxv2bf16(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: sitofp_nxv2i64_to_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = sitofp <vscale x 2 x i64> %a to <vscale x 2 x bf16>
+  ret <vscale x 2 x bf16> %res
+}
+
+define <vscale x 4 x bf16> @sitofp_nxv4i1_to_nxv4bf16(<vscale x 4 x i1> %a) {
+; CHECK-LABEL: sitofp_nxv4i1_to_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    fmov z0.h, p0/m, #-1.87500000
+; CHECK-NEXT:    ret
+  %res = sitofp <vscale x 4 x i1> %a to <vscale x 4 x bf16>
+  ret <vscale x 4 x bf16> %res
+}
+
+define <vscale x 4 x bf16> @sitofp_nxv4i8_to_nxv4bf16(<vscale x 4 x i8> %a) {
+; CHECK-LABEL: sitofp_nxv4i8_to_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sxtb z0.s, p0/m, z0.s
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = sitofp <vscale x 4 x i8> %a to <vscale x 4 x bf16>
+  ret <vscale x 4 x bf16> %res
+}
+
+define <vscale x 4 x bf16> @sitofp_nxv4i16_to_nxv4bf16(<vscale x 4 x i16> %a) {
+; CHECK-LABEL: sitofp_nxv4i16_to_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sxth z0.s, p0/m, z0.s
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = sitofp <vscale x 4 x i16> %a to <vscale x 4 x bf16>
+  ret <vscale x 4 x bf16> %res
+}
+
+define <vscale x 4 x bf16> @sitofp_nxv4i32_to_nxv4bf16(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: sitofp_nxv4i32_to_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = sitofp <vscale x 4 x i32> %a to <vscale x 4 x bf16>
+  ret <vscale x 4 x bf16> %res
+}
+
+define <vscale x 4 x bf16> @sitofp_nxv4i64_to_nxv4bf16(<vscale x 4 x i64> %a) {
+; CHECK-LABEL: sitofp_nxv4i64_to_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    scvtf z1.s, p0/m, z1.d
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %res = sitofp <vscale x 4 x i64> %a to <vscale x 4 x bf16>
+  ret <vscale x 4 x bf16> %res
+}
+
+define <vscale x 8 x bf16> @sitofp_nxv8i1_to_nxv8bf16(<vscale x 8 x i1> %a) {
+; CHECK-LABEL: sitofp_nxv8i1_to_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    fmov z0.h, p0/m, #-1.87500000
+; CHECK-NEXT:    ret
+  %res = sitofp <vscale x 8 x i1> %a to <vscale x 8 x bf16>
+  ret <vscale x 8 x bf16> %res
+}
+
+define <vscale x 8 x bf16> @sitofp_nxv8i8_to_nxv8bf16(<vscale x 8 x i8> %a) {
+; CHECK-LABEL: sitofp_nxv8i8_to_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.h
+; CHECK-NEXT:    sxtb z0.h, p0/m, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    sunpkhi z1.s, z0.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    scvtf z1.s, p0/m, z1.s
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = sitofp <vscale x 8 x i8> %a to <vscale x 8 x bf16>
+  ret <vscale x 8 x bf16> %res
+}
+
+define <vscale x 8 x bf16> @sitofp_nxv8i16_to_nxv8bf16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: sitofp_nxv8i16_to_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sunpkhi z1.s, z0.h
+; CHECK-NEXT:    sunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    scvtf z1.s, p0/m, z1.s
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = sitofp <vscale x 8 x i16> %a to <vscale x 8 x bf16>
+  ret <vscale x 8 x bf16> %res
+}
+
+define <vscale x 8 x bf16> @sitofp_nxv8i32_to_nxv8bf16(<vscale x 8 x i32> %a) {
+; CHECK-LABEL: sitofp_nxv8i32_to_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    scvtf z1.s, p0/m, z1.s
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = sitofp <vscale x 8 x i32> %a to <vscale x 8 x bf16>
+  ret <vscale x 8 x bf16> %res
+}
+
+define <vscale x 8 x bf16> @sitofp_nxv8i64_to_nxv8bf16(<vscale x 8 x i64> %a) {
+; CHECK-LABEL: sitofp_nxv8i64_to_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    scvtf z3.s, p0/m, z3.d
+; CHECK-NEXT:    scvtf z2.s, p0/m, z2.d
+; CHECK-NEXT:    scvtf z1.s, p0/m, z1.d
+; CHECK-NEXT:    scvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    bfcvt z3.h, p0/m, z3.s
+; CHECK-NEXT:    bfcvt z2.h, p0/m, z2.s
+; CHECK-NEXT:    bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    ret
+  %res = sitofp <vscale x 8 x i64> %a to <vscale x 8 x bf16>
+  ret <vscale x 8 x bf16> %res
+}
+
+; NOTE: f16(1.875) == bf16(1.0)
+define <vscale x 2 x bf16> @uitofp_nxv2i1_to_nxv2bf16(<vscale x 2 x i1> %a) {
+; CHECK-LABEL: uitofp_nxv2i1_to_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    fmov z0.h, p0/m, #1.87500000
+; CHECK-NEXT:    ret
+  %res = uitofp <vscale x 2 x i1> %a to <vscale x 2 x bf16>
+  ret <vscale x 2 x bf16> %res
+}
+
+define <vscale x 2 x bf16> @uitofp_nxv2i8_to_nxv2bf16(<vscale x 2 x i8> %a) {
+; CHECK-LABEL: uitofp_nxv2i8_to_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.d, z0.d, #0xff
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = uitofp <vscale x 2 x i8> %a to <vscale x 2 x bf16>
+  ret <vscale x 2 x bf16> %res
+}
+
+define <vscale x 2 x bf16> @uitofp_nxv2i16_to_nxv2bf16(<vscale x 2 x i16> %a) {
+; CHECK-LABEL: uitofp_nxv2i16_to_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.d, z0.d, #0xffff
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = uitofp <vscale x 2 x i16> %a to <vscale x 2 x bf16>
+  ret <vscale x 2 x bf16> %res
+}
+
+define <vscale x 2 x bf16> @uitofp_nxv2i32_to_nxv2bf16(<vscale x 2 x i32> %a) {
+; CHECK-LABEL: uitofp_nxv2i32_to_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = uitofp <vscale x 2 x i32> %a to <vscale x 2 x bf16>
+  ret <vscale x 2 x bf16> %res
+}
+
+define <vscale x 2 x bf16> @uitofp_nxv2i64_to_nxv2bf16(<vscale x 2 x i64> %a) {
+; CHECK-LABEL: uitofp_nxv2i64_to_nxv2bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = uitofp <vscale x 2 x i64> %a to <vscale x 2 x bf16>
+  ret <vscale x 2 x bf16> %res
+}
+
+define <vscale x 4 x bf16> @uitofp_nxv4i1_to_nxv4bf16(<vscale x 4 x i1> %a) {
+; CHECK-LABEL: uitofp_nxv4i1_to_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    fmov z0.h, p0/m, #1.87500000
+; CHECK-NEXT:    ret
+  %res = uitofp <vscale x 4 x i1> %a to <vscale x 4 x bf16>
+  ret <vscale x 4 x bf16> %res
+}
+
+define <vscale x 4 x bf16> @uitofp_nxv4i8_to_nxv4bf16(<vscale x 4 x i8> %a) {
+; CHECK-LABEL: uitofp_nxv4i8_to_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.s, z0.s, #0xff
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = uitofp <vscale x 4 x i8> %a to <vscale x 4 x bf16>
+  ret <vscale x 4 x bf16> %res
+}
+
+define <vscale x 4 x bf16> @uitofp_nxv4i16_to_nxv4bf16(<vscale x 4 x i16> %a) {
+; CHECK-LABEL: uitofp_nxv4i16_to_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.s, z0.s, #0xffff
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = uitofp <vscale x 4 x i16> %a to <vscale x 4 x bf16>
+  ret <vscale x 4 x bf16> %res
+}
+
+define <vscale x 4 x bf16> @uitofp_nxv4i32_to_nxv4bf16(<vscale x 4 x i32> %a) {
+; CHECK-LABEL: uitofp_nxv4i32_to_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    ret
+  %res = uitofp <vscale x 4 x i32> %a to <vscale x 4 x bf16>
+  ret <vscale x 4 x bf16> %res
+}
+
+define <vscale x 4 x bf16> @uitofp_nxv4i64_to_nxv4bf16(<vscale x 4 x i64> %a) {
+; CHECK-LABEL: uitofp_nxv4i64_to_nxv4bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.d
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    ret
+  %res = uitofp <vscale x 4 x i64> %a to <vscale x 4 x bf16>
+  ret <vscale x 4 x bf16> %res
+}
+
+define <vscale x 8 x bf16> @uitofp_nxv8i1_to_nxv8bf16(<vscale x 8 x i1> %a) {
+; CHECK-LABEL: uitofp_nxv8i1_to_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z0.h, #0 // =0x0
+; CHECK-NEXT:    fmov z0.h, p0/m, #1.87500000
+; CHECK-NEXT:    ret
+  %res = uitofp <vscale x 8 x i1> %a to <vscale x 8 x bf16>
+  ret <vscale x 8 x bf16> %res
+}
+
+define <vscale x 8 x bf16> @uitofp_nxv8i8_to_nxv8bf16(<vscale x 8 x i8> %a) {
+; CHECK-LABEL: uitofp_nxv8i8_to_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    and z0.h, z0.h, #0xff
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.s
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = uitofp <vscale x 8 x i8> %a to <vscale x 8 x bf16>
+  ret <vscale x 8 x bf16> %res
+}
+
+define <vscale x 8 x bf16> @uitofp_nxv8i16_to_nxv8bf16(<vscale x 8 x i16> %a) {
+; CHECK-LABEL: uitofp_nxv8i16_to_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpkhi z1.s, z0.h
+; CHECK-NEXT:    uunpklo z0.s, z0.h
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.s
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = uitofp <vscale x 8 x i16> %a to <vscale x 8 x bf16>
+  ret <vscale x 8 x bf16> %res
+}
+
+define <vscale x 8 x bf16> @uitofp_nxv8i32_to_nxv8bf16(<vscale x 8 x i32> %a) {
+; CHECK-LABEL: uitofp_nxv8i32_to_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.s
+; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.s
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.s
+; CHECK-NEXT:    bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = uitofp <vscale x 8 x i32> %a to <vscale x 8 x bf16>
+  ret <vscale x 8 x bf16> %res
+}
+
+define <vscale x 8 x bf16> @uitofp_nxv8i64_to_nxv8bf16(<vscale x 8 x i64> %a) {
+; CHECK-LABEL: uitofp_nxv8i64_to_nxv8bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    ptrue p0.d
+; CHECK-NEXT:    ucvtf z3.s, p0/m, z3.d
+; CHECK-NEXT:    ucvtf z2.s, p0/m, z2.d
+; CHECK-NEXT:    ucvtf z1.s, p0/m, z1.d
+; CHECK-NEXT:    ucvtf z0.s, p0/m, z0.d
+; CHECK-NEXT:    bfcvt z3.h, p0/m, z3.s
+; CHECK-NEXT:    bfcvt z2.h, p0/m, z2.s
+; CHECK-NEXT:    bfcvt z1.h, p0/m, z1.s
+; CHECK-NEXT:    bfcvt z0.h, p0/m, z0.s
+; CHECK-NEXT:    uzp1 z2.s, z2.s, z3.s
+; CHECK-NEXT:    uzp1 z0.s, z0.s, z1.s
+; CHECK-NEXT:    uzp1 z0.h, z0.h, z2.h
+; CHECK-NEXT:    ret
+  %res = uitofp <vscale x 8 x i64> %a to <vscale x 8 x bf16>
+  ret <vscale x 8 x bf16> %res
+}
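
Reviewer note (not part of the patch): the fpimm16XForm patterns in SVEInstrFormats.td work because IEEE half and bfloat16 are both 16-bit formats, so some bit patterns simply denote different values in each format. Below is a minimal standalone C++ sketch of the aliasing the TableGen comment cites, f16(1.875) == bf16(1.0), assuming nothing beyond the two formats' field layouts (1/5/10 bits for f16, 1/8/7 bits for bf16); the constant names are illustrative only.

#include <cassert>
#include <cstdint>

int main() {
  // f16 1.875 = 1.111b * 2^0: sign 0, biased exponent 15, mantissa 0b1110000000.
  const uint16_t F16_1_875 = (0u << 15) | (15u << 10) | 0x380u;
  // bf16 1.0: sign 0, biased exponent 127, mantissa 0.
  const uint16_t BF16_1_0 = (0u << 15) | (127u << 7) | 0u;
  // Both encode as 0x3F80, so an instruction that materialises f16 1.875 also
  // materialises bf16 1.0; flipping the sign bit (0xBF80) gives the
  // f16(-1.875) == bf16(-1.0) pair the sitofp-from-i1 tests rely on, matching
  // the "fmov z0.h, p0/m, #-1.87500000" check lines above.
  assert(F16_1_875 == 0x3F80 && BF16_1_0 == 0x3F80);
  return 0;
}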