diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index e55e9989e6565..dd7d36748f7ae 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -268,6 +268,7 @@ static bool isMergePassthruOpcode(unsigned Opc) {
   case AArch64ISD::FP_EXTEND_MERGE_PASSTHRU:
   case AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU:
   case AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU:
+  case AArch64ISD::FCVTX_MERGE_PASSTHRU:
   case AArch64ISD::FCVTZU_MERGE_PASSTHRU:
   case AArch64ISD::FCVTZS_MERGE_PASSTHRU:
   case AArch64ISD::FSQRT_MERGE_PASSTHRU:
@@ -2622,6 +2623,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
     MAKE_CASE(AArch64ISD::FP_EXTEND_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU)
+    MAKE_CASE(AArch64ISD::FCVTX_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::FCVTZU_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::FCVTZS_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::FSQRT_MERGE_PASSTHRU)
@@ -4363,6 +4365,19 @@ SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
       // Set the quiet bit.
       if (!DAG.isKnownNeverSNaN(SrcVal))
         NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000));
+    } else if (SrcVT == MVT::nxv2f64 &&
+               (Subtarget->hasSVE2() || Subtarget->isStreamingSVEAvailable())) {
+      // Round to float without introducing rounding errors and try again.
+      SDValue Pg = getPredicateForVector(DAG, DL, MVT::nxv2f32);
+      Narrow = DAG.getNode(AArch64ISD::FCVTX_MERGE_PASSTHRU, DL, MVT::nxv2f32,
+                           Pg, SrcVal, DAG.getUNDEF(MVT::nxv2f32));
+
+      SmallVector<SDValue, 3> NewOps;
+      if (IsStrict)
+        NewOps.push_back(Op.getOperand(0));
+      NewOps.push_back(Narrow);
+      NewOps.push_back(Op.getOperand(IsStrict ? 2 : 1));
+      return DAG.getNode(Op.getOpcode(), DL, VT, NewOps, Op->getFlags());
     } else
       return SDValue();
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 480bf60360bf5..1bae7562f459a 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -158,6 +158,7 @@ enum NodeType : unsigned {
   FP_EXTEND_MERGE_PASSTHRU,
   UINT_TO_FP_MERGE_PASSTHRU,
   SINT_TO_FP_MERGE_PASSTHRU,
+  FCVTX_MERGE_PASSTHRU,
   FCVTZU_MERGE_PASSTHRU,
   FCVTZS_MERGE_PASSTHRU,
   SIGN_EXTEND_INREG_MERGE_PASSTHRU,
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index 76362768e0aa6..53d9473975a23 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -357,6 +357,7 @@ def AArch64fcvtr_mt : SDNode<"AArch64ISD::FP_ROUND_MERGE_PASSTHRU", SDT_AArch64
 def AArch64fcvte_mt : SDNode<"AArch64ISD::FP_EXTEND_MERGE_PASSTHRU", SDT_AArch64FCVT>;
 def AArch64ucvtf_mt : SDNode<"AArch64ISD::UINT_TO_FP_MERGE_PASSTHRU", SDT_AArch64FCVT>;
 def AArch64scvtf_mt : SDNode<"AArch64ISD::SINT_TO_FP_MERGE_PASSTHRU", SDT_AArch64FCVT>;
+def AArch64fcvtx_mt : SDNode<"AArch64ISD::FCVTX_MERGE_PASSTHRU", SDT_AArch64FCVT>;
 def AArch64fcvtzu_mt : SDNode<"AArch64ISD::FCVTZU_MERGE_PASSTHRU", SDT_AArch64FCVT>;
 def AArch64fcvtzs_mt : SDNode<"AArch64ISD::FCVTZS_MERGE_PASSTHRU", SDT_AArch64FCVT>;
 
@@ -3779,7 +3780,7 @@ let Predicates = [HasSVE2orSME, UseExperimentalZeroingPseudos] in {
 let Predicates = [HasSVE2orSME] in {
   // SVE2 floating-point convert precision
   defm FCVTXNT_ZPmZ : sve2_fp_convert_down_odd_rounding_top<"fcvtxnt", "int_aarch64_sve_fcvtxnt">;
-  defm FCVTX_ZPmZ   : sve2_fp_convert_down_odd_rounding<"fcvtx", "int_aarch64_sve_fcvtx">;
+  defm FCVTX_ZPmZ   : sve2_fp_convert_down_odd_rounding<"fcvtx", "int_aarch64_sve_fcvtx", AArch64fcvtx_mt>;
   defm FCVTNT_ZPmZ  : sve2_fp_convert_down_narrow<"fcvtnt", "int_aarch64_sve_fcvtnt">;
   defm FCVTLT_ZPmZ  : sve2_fp_convert_up_long<"fcvtlt", "int_aarch64_sve_fcvtlt">;
 
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index accfb49c6fbe3..9856415361e50 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -188,10 +188,14 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
            (hasSMEFA64() || (!isStreaming() && !isStreamingCompatible()));
   }
 
-  /// Returns true if the target has access to either the full range of SVE instructions,
-  /// or the streaming-compatible subset of SVE instructions.
+  /// Returns true if the target has access to the streaming-compatible subset
+  /// of SVE instructions.
+  bool isStreamingSVEAvailable() const { return hasSME() && isStreaming(); }
+
+  /// Returns true if the target has access to either the full range of SVE
+  /// instructions, or the streaming-compatible subset of SVE instructions.
   bool isSVEorStreamingSVEAvailable() const {
-    return hasSVE() || (hasSME() && isStreaming());
+    return hasSVE() || isStreamingSVEAvailable();
   }
 
   unsigned getMinVectorRegisterBitWidth() const {
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index 13c2a90a963f8..121e19ac0397f 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -3059,9 +3059,11 @@ multiclass sve2_fp_un_pred_zeroing_hsd<SDPatternOperator op> {
   def : SVE_1_Op_PassthruZero_Pat<nxv2i64, op, nxv2i1, nxv2f64, !cast<Pseudo>(NAME # _D_ZERO)>;
 }
 
-multiclass sve2_fp_convert_down_odd_rounding<string asm, string op> {
+multiclass sve2_fp_convert_down_odd_rounding<string asm, string op, SDPatternOperator ir_op> {
   def _DtoS : sve_fp_2op_p_zd<0b0001010, asm, ZPR64, ZPR32, ElementSizeD>;
+
   def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
+  def : SVE_1_Op_Passthru_Pat<nxv2f32, ir_op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/llvm/test/CodeGen/AArch64/sve2-bf16-converts.ll b/llvm/test/CodeGen/AArch64/sve2-bf16-converts.ll
new file mode 100644
index 0000000000000..e5d4e1e9bc7da
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2-bf16-converts.ll
@@ -0,0 +1,201 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mattr=+sve2 < %s | FileCheck %s --check-prefixes=NOBF16
+; RUN: llc -mattr=+sve2 --enable-no-nans-fp-math < %s | FileCheck %s --check-prefixes=NOBF16NNAN
+; RUN: llc -mattr=+sve2,+bf16 < %s | FileCheck %s --check-prefixes=BF16
+; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=BF16
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 2 x bfloat> @fptrunc_nxv2f64_to_nxv2bf16(<vscale x 2 x double> %a) {
+; NOBF16-LABEL: fptrunc_nxv2f64_to_nxv2bf16:
+; NOBF16:       // %bb.0:
+; NOBF16-NEXT:    ptrue p0.d
+; NOBF16-NEXT:    mov z1.s, #32767 // =0x7fff
+; NOBF16-NEXT:    fcvtx z0.s, p0/m, z0.d
+; NOBF16-NEXT:    lsr z2.s, z0.s, #16
+; NOBF16-NEXT:    add z1.s, z0.s, z1.s
+; NOBF16-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
+; NOBF16-NEXT:    orr z0.s, z0.s, #0x400000
+; NOBF16-NEXT:    and z2.s, z2.s, #0x1
+; NOBF16-NEXT:    add z1.s, z2.s, z1.s
+; NOBF16-NEXT:    sel z0.s, p0, z0.s, z1.s
+; NOBF16-NEXT:    lsr z0.s, z0.s, #16
+; NOBF16-NEXT:    ret
+;
+; NOBF16NNAN-LABEL: fptrunc_nxv2f64_to_nxv2bf16:
+; NOBF16NNAN:       // %bb.0:
+; NOBF16NNAN-NEXT:    ptrue p0.d
+; NOBF16NNAN-NEXT:    mov z1.s, #32767 // =0x7fff
+; NOBF16NNAN-NEXT:    fcvtx z0.s, p0/m, z0.d
+; NOBF16NNAN-NEXT:    lsr z2.s, z0.s, #16
+; NOBF16NNAN-NEXT:    add z0.s, z0.s, z1.s
+; NOBF16NNAN-NEXT:    and z2.s, z2.s, #0x1
+; NOBF16NNAN-NEXT:    add z0.s, z2.s, z0.s
+; NOBF16NNAN-NEXT:    lsr z0.s, z0.s, #16
+; NOBF16NNAN-NEXT:    ret
+;
+; BF16-LABEL: fptrunc_nxv2f64_to_nxv2bf16:
+; BF16:       // %bb.0:
+; BF16-NEXT:    ptrue p0.d
+; BF16-NEXT:    fcvtx z0.s, p0/m, z0.d
+; BF16-NEXT:    bfcvt z0.h, p0/m, z0.s
+; BF16-NEXT:    ret
+  %res = fptrunc <vscale x 2 x double> %a to <vscale x 2 x bfloat>
+  ret <vscale x 2 x bfloat> %res
+}
+
+define <vscale x 4 x bfloat> @fptrunc_nxv4f64_to_nxv4bf16(<vscale x 4 x double> %a) {
+; NOBF16-LABEL: fptrunc_nxv4f64_to_nxv4bf16:
+; NOBF16:       // %bb.0:
+; NOBF16-NEXT:    ptrue p0.d
+; NOBF16-NEXT:    mov z2.s, #32767 // =0x7fff
+; NOBF16-NEXT:    fcvtx z1.s, p0/m, z1.d
+; NOBF16-NEXT:    fcvtx z0.s, p0/m, z0.d
+; NOBF16-NEXT:    lsr z3.s, z1.s, #16
+; NOBF16-NEXT:    lsr z4.s, z0.s, #16
+; NOBF16-NEXT:    add z5.s, z1.s, z2.s
+; NOBF16-NEXT:    add z2.s, z0.s, z2.s
+; NOBF16-NEXT:    fcmuo p1.s, p0/z, z1.s, z1.s
+; NOBF16-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
+; NOBF16-NEXT:    orr z1.s, z1.s, #0x400000
+; NOBF16-NEXT:    orr z0.s, z0.s, #0x400000
+; NOBF16-NEXT:    and z3.s, z3.s, #0x1
+; NOBF16-NEXT:    and z4.s, z4.s, #0x1
+; NOBF16-NEXT:    add z3.s, z3.s, z5.s
+; NOBF16-NEXT:    add z2.s, z4.s, z2.s
+; NOBF16-NEXT:    sel z1.s, p1, z1.s, z3.s
+; NOBF16-NEXT:    sel z0.s, p0, z0.s, z2.s
+; NOBF16-NEXT:    lsr z1.s, z1.s, #16
+; NOBF16-NEXT:    lsr z0.s, z0.s, #16
+; NOBF16-NEXT:    uzp1 z0.s, z0.s, z1.s
+; NOBF16-NEXT:    ret
+;
+; NOBF16NNAN-LABEL: fptrunc_nxv4f64_to_nxv4bf16:
+; NOBF16NNAN:       // %bb.0:
+; NOBF16NNAN-NEXT:    ptrue p0.d
+; NOBF16NNAN-NEXT:    mov z2.s, #32767 // =0x7fff
+; NOBF16NNAN-NEXT:    fcvtx z1.s, p0/m, z1.d
+; NOBF16NNAN-NEXT:    fcvtx z0.s, p0/m, z0.d
+; NOBF16NNAN-NEXT:    lsr z3.s, z1.s, #16
+; NOBF16NNAN-NEXT:    lsr z4.s, z0.s, #16
+; NOBF16NNAN-NEXT:    add z1.s, z1.s, z2.s
+; NOBF16NNAN-NEXT:    add z0.s, z0.s, z2.s
+; NOBF16NNAN-NEXT:    and z3.s, z3.s, #0x1
+; NOBF16NNAN-NEXT:    and z4.s, z4.s, #0x1
+; NOBF16NNAN-NEXT:    add z1.s, z3.s, z1.s
+; NOBF16NNAN-NEXT:    add z0.s, z4.s, z0.s
+; NOBF16NNAN-NEXT:    lsr z1.s, z1.s, #16
+; NOBF16NNAN-NEXT:    lsr z0.s, z0.s, #16
+; NOBF16NNAN-NEXT:    uzp1 z0.s, z0.s, z1.s
+; NOBF16NNAN-NEXT:    ret
+;
+; BF16-LABEL: fptrunc_nxv4f64_to_nxv4bf16:
+; BF16:       // %bb.0:
+; BF16-NEXT:    ptrue p0.d
+; BF16-NEXT:    fcvtx z1.s, p0/m, z1.d
+; BF16-NEXT:    fcvtx z0.s, p0/m, z0.d
+; BF16-NEXT:    bfcvt z1.h, p0/m, z1.s
+; BF16-NEXT:    bfcvt z0.h, p0/m, z0.s
+; BF16-NEXT:    uzp1 z0.s, z0.s, z1.s
+; BF16-NEXT:    ret
+  %res = fptrunc <vscale x 4 x double> %a to <vscale x 4 x bfloat>
+  ret <vscale x 4 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @fptrunc_nxv8f64_to_nxv8bf16(<vscale x 8 x double> %a) {
+; NOBF16-LABEL: fptrunc_nxv8f64_to_nxv8bf16:
+; NOBF16:       // %bb.0:
+; NOBF16-NEXT:    ptrue p0.d
+; NOBF16-NEXT:    mov z4.s, #32767 // =0x7fff
+; NOBF16-NEXT:    fcvtx z3.s, p0/m, z3.d
+; NOBF16-NEXT:    fcvtx z2.s, p0/m, z2.d
+; NOBF16-NEXT:    fcvtx z1.s, p0/m, z1.d
+; NOBF16-NEXT:    fcvtx z0.s, p0/m, z0.d
+; NOBF16-NEXT:    lsr z5.s, z3.s, #16
+; NOBF16-NEXT:    lsr z6.s, z2.s, #16
+; NOBF16-NEXT:    lsr z7.s, z1.s, #16
+; NOBF16-NEXT:    lsr z24.s, z0.s, #16
+; NOBF16-NEXT:    add z25.s, z3.s, z4.s
+; NOBF16-NEXT:    add z26.s, z2.s, z4.s
+; NOBF16-NEXT:    add z27.s, z1.s, z4.s
+; NOBF16-NEXT:    add z4.s, z0.s, z4.s
+; NOBF16-NEXT:    fcmuo p1.s, p0/z, z3.s, z3.s
+; NOBF16-NEXT:    and z5.s, z5.s, #0x1
+; NOBF16-NEXT:    and z6.s, z6.s, #0x1
+; NOBF16-NEXT:    and z7.s, z7.s, #0x1
+; NOBF16-NEXT:    and z24.s, z24.s, #0x1
+; NOBF16-NEXT:    fcmuo p2.s, p0/z, z2.s, z2.s
+; NOBF16-NEXT:    fcmuo p3.s, p0/z, z1.s, z1.s
+; NOBF16-NEXT:    fcmuo p0.s, p0/z, z0.s, z0.s
+; NOBF16-NEXT:    orr z3.s, z3.s, #0x400000
+; NOBF16-NEXT:    orr z2.s, z2.s, #0x400000
+; NOBF16-NEXT:    add z5.s, z5.s, z25.s
+; NOBF16-NEXT:    add z6.s, z6.s, z26.s
+; NOBF16-NEXT:    add z7.s, z7.s, z27.s
+; NOBF16-NEXT:    add z4.s, z24.s, z4.s
+; NOBF16-NEXT:    orr z1.s, z1.s, #0x400000
+; NOBF16-NEXT:    orr z0.s, z0.s, #0x400000
+; NOBF16-NEXT:    sel z3.s, p1, z3.s, z5.s
+; NOBF16-NEXT:    sel z2.s, p2, z2.s, z6.s
+; NOBF16-NEXT:    sel z1.s, p3, z1.s, z7.s
+; NOBF16-NEXT:    sel z0.s, p0, z0.s, z4.s
+; NOBF16-NEXT:    lsr z3.s, z3.s, #16
+; NOBF16-NEXT:    lsr z2.s, z2.s, #16
+; NOBF16-NEXT:    lsr z1.s, z1.s, #16
+; NOBF16-NEXT:    lsr z0.s, z0.s, #16
+; NOBF16-NEXT:    uzp1 z2.s, z2.s, z3.s
+; NOBF16-NEXT:    uzp1 z0.s, z0.s, z1.s
+; NOBF16-NEXT:    uzp1 z0.h, z0.h, z2.h
+; NOBF16-NEXT:    ret
+;
+; NOBF16NNAN-LABEL: fptrunc_nxv8f64_to_nxv8bf16:
+; NOBF16NNAN:       // %bb.0:
+; NOBF16NNAN-NEXT:    ptrue p0.d
+; NOBF16NNAN-NEXT:    mov z4.s, #32767 // =0x7fff
+; NOBF16NNAN-NEXT:    fcvtx z3.s, p0/m, z3.d
+; NOBF16NNAN-NEXT:    fcvtx z2.s, p0/m, z2.d
+; NOBF16NNAN-NEXT:    fcvtx z1.s, p0/m, z1.d
+; NOBF16NNAN-NEXT:    fcvtx z0.s, p0/m, z0.d
+; NOBF16NNAN-NEXT:    lsr z5.s, z3.s, #16
+; NOBF16NNAN-NEXT:    lsr z6.s, z2.s, #16
+; NOBF16NNAN-NEXT:    lsr z7.s, z1.s, #16
+; NOBF16NNAN-NEXT:    lsr z24.s, z0.s, #16
+; NOBF16NNAN-NEXT:    add z3.s, z3.s, z4.s
+; NOBF16NNAN-NEXT:    add z2.s, z2.s, z4.s
+; NOBF16NNAN-NEXT:    add z1.s, z1.s, z4.s
+; NOBF16NNAN-NEXT:    add z0.s, z0.s, z4.s
+; NOBF16NNAN-NEXT:    and z5.s, z5.s, #0x1
+; NOBF16NNAN-NEXT:    and z6.s, z6.s, #0x1
+; NOBF16NNAN-NEXT:    and z7.s, z7.s, #0x1
+; NOBF16NNAN-NEXT:    and z24.s, z24.s, #0x1
+; NOBF16NNAN-NEXT:    add z3.s, z5.s, z3.s
+; NOBF16NNAN-NEXT:    add z2.s, z6.s, z2.s
+; NOBF16NNAN-NEXT:    add z1.s, z7.s, z1.s
+; NOBF16NNAN-NEXT:    add z0.s, z24.s, z0.s
+; NOBF16NNAN-NEXT:    lsr z3.s, z3.s, #16
+; NOBF16NNAN-NEXT:    lsr z2.s, z2.s, #16
+; NOBF16NNAN-NEXT:    lsr z1.s, z1.s, #16
+; NOBF16NNAN-NEXT:    lsr z0.s, z0.s, #16
+; NOBF16NNAN-NEXT:    uzp1 z2.s, z2.s, z3.s
+; NOBF16NNAN-NEXT:    uzp1 z0.s, z0.s, z1.s
+; NOBF16NNAN-NEXT:    uzp1 z0.h, z0.h, z2.h
+; NOBF16NNAN-NEXT:    ret
+;
+; BF16-LABEL: fptrunc_nxv8f64_to_nxv8bf16:
+; BF16:       // %bb.0:
+; BF16-NEXT:    ptrue p0.d
+; BF16-NEXT:    fcvtx z3.s, p0/m, z3.d
+; BF16-NEXT:    fcvtx z2.s, p0/m, z2.d
+; BF16-NEXT:    fcvtx z1.s, p0/m, z1.d
+; BF16-NEXT:    fcvtx z0.s, p0/m, z0.d
+; BF16-NEXT:    bfcvt z3.h, p0/m, z3.s
+; BF16-NEXT:    bfcvt z2.h, p0/m, z2.s
+; BF16-NEXT:    bfcvt z1.h, p0/m, z1.s
+; BF16-NEXT:    bfcvt z0.h, p0/m, z0.s
+; BF16-NEXT:    uzp1 z2.s, z2.s, z3.s
+; BF16-NEXT:    uzp1 z0.s, z0.s, z1.s
+; BF16-NEXT:    uzp1 z0.h, z0.h, z2.h
+; BF16-NEXT:    ret
+  %res = fptrunc <vscale x 8 x double> %a to <vscale x 8 x bfloat>
+  ret <vscale x 8 x bfloat> %res
+}