[LLVM][SVE] Implement isel for fptoi half/float/double to i1. #129269
Conversation
Also adds asserts to make explicit that SVE support for strict_fp fp<->int operations is not yet implemented. The added costs maintain the existing values expected by Analysis/CostModel/AArch64/sve-cast.ll.

NOTE: This PR omits bfloat support because it is broken for all result types; that will be fixed in a follow-up PR.
@llvm/pr-subscribers-backend-aarch64

Author: Paul Walker (paulwalker-arm)

Full diff: https://github.com/llvm/llvm-project/pull/129269.diff

3 Files Affected:
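For context, the core of the new lowering: an fp-to-int with an i1 result is first converted into the predicate's promoted integer type, then compared against zero via SETNE. A minimal sketch in LLVM IR (this mirrors the fcvtzs_nxv2f64_to_nxv2i1 test added below; the expected AArch64 output in the comments is taken from the new CHECK lines):

define <vscale x 2 x i1> @fp_to_i1_sketch(<vscale x 2 x double> %a) {
; With this patch, isel emits roughly:
;   ptrue  p0.d
;   fcvtzs z0.d, p0/m, z0.d      ; convert into the promoted i64 lanes
;   cmpne  p0.d, p0/z, z0.d, #0  ; SETNE against zero yields the predicate
  %res = fptosi <vscale x 2 x double> %a to <vscale x 2 x i1>
  ret <vscale x 2 x i1> %res
}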
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7a471662ea075..aab55bf725d98 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -1567,6 +1567,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// There are no legal MVT::nxv16f## based types.
if (VT != MVT::nxv16i1) {
+ setOperationAction(ISD::FP_TO_SINT, VT, Custom);
+ setOperationAction(ISD::FP_TO_UINT, VT, Custom);
setOperationAction(ISD::SINT_TO_FP, VT, Custom);
setOperationAction(ISD::UINT_TO_FP, VT, Custom);
}
@@ -4726,7 +4728,18 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
EVT InVT = Op.getOperand(IsStrict ? 1 : 0).getValueType();
EVT VT = Op.getValueType();
+ assert(!(IsStrict && VT.isScalableVector()) &&
+ "Unimplemented SVE support for STRICT_FP_to_INT!");
+
if (VT.isScalableVector()) {
+ if (VT.getVectorElementType() == MVT::i1) {
+ SDLoc DL(Op);
+ EVT CvtVT = getPromotedVTForPredicate(VT);
+ SDValue Cvt = DAG.getNode(Op.getOpcode(), DL, CvtVT, Op.getOperand(0));
+ SDValue Zero = DAG.getConstant(0, DL, CvtVT);
+ return DAG.getSetCC(DL, VT, Cvt, Zero, ISD::SETNE);
+ }
+
unsigned Opcode = Op.getOpcode() == ISD::FP_TO_UINT
? AArch64ISD::FCVTZU_MERGE_PASSTHRU
: AArch64ISD::FCVTZS_MERGE_PASSTHRU;
@@ -5032,6 +5045,9 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
unsigned Opc = Op.getOpcode();
bool IsSigned = Opc == ISD::SINT_TO_FP || Opc == ISD::STRICT_SINT_TO_FP;
+ assert(!(IsStrict && VT.isScalableVector()) &&
+ "Unimplemented SVE support for ISD:::STRICT_INT_TO_FP!");
+
if (VT.isScalableVector()) {
if (InVT.getVectorElementType() == MVT::i1) {
// We can't directly extend an SVE predicate; extend it first.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 1008be32e5bfa..670bb98988297 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -3017,20 +3017,24 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ISD::FP_TO_SINT, MVT::nxv2i32, MVT::nxv2f64, 1},
{ISD::FP_TO_SINT, MVT::nxv2i16, MVT::nxv2f64, 1},
{ISD::FP_TO_SINT, MVT::nxv2i8, MVT::nxv2f64, 1},
+ {ISD::FP_TO_SINT, MVT::nxv2i1, MVT::nxv2f64, 1},
{ISD::FP_TO_UINT, MVT::nxv2i64, MVT::nxv2f64, 1},
{ISD::FP_TO_UINT, MVT::nxv2i32, MVT::nxv2f64, 1},
{ISD::FP_TO_UINT, MVT::nxv2i16, MVT::nxv2f64, 1},
{ISD::FP_TO_UINT, MVT::nxv2i8, MVT::nxv2f64, 1},
+ {ISD::FP_TO_UINT, MVT::nxv2i1, MVT::nxv2f64, 1},
// Complex, from nxv4f32.
{ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f32, 4},
{ISD::FP_TO_SINT, MVT::nxv4i32, MVT::nxv4f32, 1},
{ISD::FP_TO_SINT, MVT::nxv4i16, MVT::nxv4f32, 1},
{ISD::FP_TO_SINT, MVT::nxv4i8, MVT::nxv4f32, 1},
+ {ISD::FP_TO_SINT, MVT::nxv4i1, MVT::nxv4f32, 1},
{ISD::FP_TO_UINT, MVT::nxv4i64, MVT::nxv4f32, 4},
{ISD::FP_TO_UINT, MVT::nxv4i32, MVT::nxv4f32, 1},
{ISD::FP_TO_UINT, MVT::nxv4i16, MVT::nxv4f32, 1},
{ISD::FP_TO_UINT, MVT::nxv4i8, MVT::nxv4f32, 1},
+ {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
// Complex, from nxv8f64. Illegal -> illegal conversions not required.
{ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f64, 7},
@@ -3057,10 +3061,12 @@ InstructionCost AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
{ISD::FP_TO_SINT, MVT::nxv8i32, MVT::nxv8f16, 4},
{ISD::FP_TO_SINT, MVT::nxv8i16, MVT::nxv8f16, 1},
{ISD::FP_TO_SINT, MVT::nxv8i8, MVT::nxv8f16, 1},
+ {ISD::FP_TO_SINT, MVT::nxv8i1, MVT::nxv8f16, 1},
{ISD::FP_TO_UINT, MVT::nxv8i64, MVT::nxv8f16, 10},
{ISD::FP_TO_UINT, MVT::nxv8i32, MVT::nxv8f16, 4},
{ISD::FP_TO_UINT, MVT::nxv8i16, MVT::nxv8f16, 1},
{ISD::FP_TO_UINT, MVT::nxv8i8, MVT::nxv8f16, 1},
+ {ISD::FP_TO_UINT, MVT::nxv8i1, MVT::nxv8f16, 1},
// Complex, from nxv4f16.
{ISD::FP_TO_SINT, MVT::nxv4i64, MVT::nxv4f16, 4},
diff --git a/llvm/test/CodeGen/AArch64/sve-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-fcvt.ll
index fc5128fffad36..1d000e4fd5d53 100644
--- a/llvm/test/CodeGen/AArch64/sve-fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fcvt.ll
@@ -113,6 +113,120 @@ define <vscale x 2 x float> @fcvts_nxv2f64(<vscale x 2 x double> %a) {
; FP_TO_SINT
;
+define <vscale x 2 x i1> @fcvtzs_nxv2f16_to_nxv2i1(<vscale x 2 x half> %a) {
+; CHECK-LABEL: fcvtzs_nxv2f16_to_nxv2i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.h
+; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 2 x half> %a to <vscale x 2 x i1>
+ ret <vscale x 2 x i1> %res
+}
+
+define <vscale x 2 x i1> @fcvtzs_nxv2f32_to_nxv2i1(<vscale x 2 x float> %a) {
+; CHECK-LABEL: fcvtzs_nxv2f32_to_nxv2i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.s
+; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 2 x float> %a to <vscale x 2 x i1>
+ ret <vscale x 2 x i1> %res
+}
+
+define <vscale x 2 x i1> @fcvtzs_nxv2f64_to_nxv2i1(<vscale x 2 x double> %a) {
+; CHECK-LABEL: fcvtzs_nxv2f64_to_nxv2i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 2 x double> %a to <vscale x 2 x i1>
+ ret <vscale x 2 x i1> %res
+}
+
+define <vscale x 4 x i1> @fcvtzs_nxv4f16_to_nxv4i1(<vscale x 4 x half> %a) {
+; CHECK-LABEL: fcvtzs_nxv4f16_to_nxv4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.h
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 4 x half> %a to <vscale x 4 x i1>
+ ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 4 x i1> @fcvtzs_nxv4f32_to_nxv4i1(<vscale x 4 x float> %a) {
+; CHECK-LABEL: fcvtzs_nxv4f32_to_nxv4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 4 x float> %a to <vscale x 4 x i1>
+ ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 4 x i1> @fcvtzs_nxv4f64_to_nxv4i1(<vscale x 4 x double> %a) {
+; CHECK-LABEL: fcvtzs_nxv4f64_to_nxv4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0
+; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: uzp1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 4 x double> %a to <vscale x 4 x i1>
+ ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 8 x i1> @fcvtzs_nxv8f16_to_nxv8i1(<vscale x 8 x half> %a) {
+; CHECK-LABEL: fcvtzs_nxv8f16_to_nxv8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fcvtzs z0.h, p0/m, z0.h
+; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 8 x half> %a to <vscale x 8 x i1>
+ ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 8 x i1> @fcvtzs_nxv8f32_to_nxv8i1(<vscale x 8 x float> %a) {
+; CHECK-LABEL: fcvtzs_nxv8f32_to_nxv8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzs z1.s, p0/m, z1.s
+; CHECK-NEXT: fcvtzs z0.s, p0/m, z0.s
+; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 8 x float> %a to <vscale x 8 x i1>
+ ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 8 x i1> @fcvtzs_nxv8f64_to_nxv8i1(<vscale x 8 x double> %a) {
+; CHECK-LABEL: fcvtzs_nxv8f64_to_nxv8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzs z3.d, p0/m, z3.d
+; CHECK-NEXT: fcvtzs z2.d, p0/m, z2.d
+; CHECK-NEXT: fcvtzs z1.d, p0/m, z1.d
+; CHECK-NEXT: fcvtzs z0.d, p0/m, z0.d
+; CHECK-NEXT: cmpne p1.d, p0/z, z3.d, #0
+; CHECK-NEXT: cmpne p2.d, p0/z, z2.d, #0
+; CHECK-NEXT: cmpne p3.d, p0/z, z1.d, #0
+; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s
+; CHECK-NEXT: uzp1 p0.s, p0.s, p3.s
+; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h
+; CHECK-NEXT: ret
+ %res = fptosi <vscale x 8 x double> %a to <vscale x 8 x i1>
+ ret <vscale x 8 x i1> %res
+}
+
define <vscale x 2 x i16> @fcvtzs_h_nxv2f16(<vscale x 2 x half> %a) {
; CHECK-LABEL: fcvtzs_h_nxv2f16:
; CHECK: // %bb.0:
@@ -277,6 +391,120 @@ define <vscale x 2 x i64> @fcvtzs_d_nxv2f64(<vscale x 2 x double> %a) {
; FP_TO_UINT
;
+define <vscale x 2 x i1> @fcvtzu_nxv2f16_to_nxv2i1(<vscale x 2 x half> %a) {
+; CHECK-LABEL: fcvtzu_nxv2f16_to_nxv2i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.h
+; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 2 x half> %a to <vscale x 2 x i1>
+ ret <vscale x 2 x i1> %res
+}
+
+define <vscale x 2 x i1> @fcvtzu_nxv2f32_to_nxv2i1(<vscale x 2 x float> %a) {
+; CHECK-LABEL: fcvtzu_nxv2f32_to_nxv2i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.s
+; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 2 x float> %a to <vscale x 2 x i1>
+ ret <vscale x 2 x i1> %res
+}
+
+define <vscale x 2 x i1> @fcvtzu_nxv2f64_to_nxv2i1(<vscale x 2 x double> %a) {
+; CHECK-LABEL: fcvtzu_nxv2f64_to_nxv2i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 2 x double> %a to <vscale x 2 x i1>
+ ret <vscale x 2 x i1> %res
+}
+
+define <vscale x 4 x i1> @fcvtzu_nxv4f16_to_nxv4i1(<vscale x 4 x half> %a) {
+; CHECK-LABEL: fcvtzu_nxv4f16_to_nxv4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.h
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 4 x half> %a to <vscale x 4 x i1>
+ ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 4 x i1> @fcvtzu_nxv4f32_to_nxv4i1(<vscale x 4 x float> %a) {
+; CHECK-LABEL: fcvtzu_nxv4f32_to_nxv4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 4 x float> %a to <vscale x 4 x i1>
+ ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 4 x i1> @fcvtzu_nxv4f64_to_nxv4i1(<vscale x 4 x double> %a) {
+; CHECK-LABEL: fcvtzu_nxv4f64_to_nxv4i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.d
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT: cmpne p1.d, p0/z, z1.d, #0
+; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: uzp1 p0.s, p0.s, p1.s
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 4 x double> %a to <vscale x 4 x i1>
+ ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 8 x i1> @fcvtzu_nxv8f16_to_nxv8i1(<vscale x 8 x half> %a) {
+; CHECK-LABEL: fcvtzu_nxv8f16_to_nxv8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: fcvtzu z0.h, p0/m, z0.h
+; CHECK-NEXT: cmpne p0.h, p0/z, z0.h, #0
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 8 x half> %a to <vscale x 8 x i1>
+ ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 8 x i1> @fcvtzu_nxv8f32_to_nxv8i1(<vscale x 8 x float> %a) {
+; CHECK-LABEL: fcvtzu_nxv8f32_to_nxv8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: fcvtzu z1.s, p0/m, z1.s
+; CHECK-NEXT: fcvtzu z0.s, p0/m, z0.s
+; CHECK-NEXT: cmpne p1.s, p0/z, z1.s, #0
+; CHECK-NEXT: cmpne p0.s, p0/z, z0.s, #0
+; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 8 x float> %a to <vscale x 8 x i1>
+ ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 8 x i1> @fcvtzu_nxv8f64_to_nxv8i1(<vscale x 8 x double> %a) {
+; CHECK-LABEL: fcvtzu_nxv8f64_to_nxv8i1:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: fcvtzu z3.d, p0/m, z3.d
+; CHECK-NEXT: fcvtzu z2.d, p0/m, z2.d
+; CHECK-NEXT: fcvtzu z1.d, p0/m, z1.d
+; CHECK-NEXT: fcvtzu z0.d, p0/m, z0.d
+; CHECK-NEXT: cmpne p1.d, p0/z, z3.d, #0
+; CHECK-NEXT: cmpne p2.d, p0/z, z2.d, #0
+; CHECK-NEXT: cmpne p3.d, p0/z, z1.d, #0
+; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0
+; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s
+; CHECK-NEXT: uzp1 p0.s, p0.s, p3.s
+; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h
+; CHECK-NEXT: ret
+ %res = fptoui <vscale x 8 x double> %a to <vscale x 8 x i1>
+ ret <vscale x 8 x i1> %res
+}
+
; NOTE: Using fcvtzs is safe as fptoui overflow is considered poison and a
; 64bit signed value encompasses the entire range of a 16bit unsigned value
define <vscale x 2 x i16> @fcvtzu_h_nxv2f16(<vscale x 2 x half> %a) {
david-arm left a comment:
LGTM! The costs for the operations may be underestimated due to the conversion process, but the chance of actually seeing conversions from FP->i1 being vectorised in loops seems very low. We can always revisit this in future if it's a problem.
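As a rough way to exercise the new cost entries, here is a hedged sketch using the cost-model printer with flags matching those used by Analysis/CostModel/AArch64/sve-cast.ll (the function name and the expected cost of 1 follow the table entries added above):

; Sketch for checking the new fp-to-i1 cast costs; run with something like:
;   opt -passes="print<cost-model>" -disable-output \
;       -mtriple=aarch64-linux-gnu -mattr=+sve input.ll
define <vscale x 4 x i1> @cast_cost_sketch(<vscale x 4 x float> %a) {
  ; Per the new table entry {ISD::FP_TO_UINT, MVT::nxv4i1, MVT::nxv4f32, 1},
  ; this cast should now report a cost of 1.
  %r = fptoui <vscale x 4 x float> %a to <vscale x 4 x i1>
  ret <vscale x 4 x i1> %r
}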