Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23942,6 +23942,60 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
Store->getMemOperand());
}

// Fold store (fp_to_sint/fp_to_uint X) into a form where the conversion is
// carried out on a NEON vector, so that the converted value can be stored
// straight from the vector register instead of first being moved to a GPR.
//
// Returns SDValue(ST, 0) once the stored value has been replaced, or an empty
// SDValue when the combine does not apply.
static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
                                        TargetLowering::DAGCombinerInfo &DCI,
                                        SelectionDAG &DAG,
                                        const AArch64Subtarget *Subtarget) {
  // Run only after legalization (so truncating stores are not peeled apart),
  // and only when NEON can actually be used.
  if (DCI.isBeforeLegalize() || !Subtarget->isNeonAvailable())
    return SDValue();

  // A stored value that is already a vector needs no help.
  SDValue StoredVal = ST->getValue();
  if (StoredVal.getValueType().isVector())
    return SDValue();

  // Strip any assertion nodes wrapping the conversion.
  SDValue Conv = StoredVal;
  while (Conv->isAssert())
    Conv = Conv.getOperand(0);

  // The (unwrapped) value must be a single-use fp -> int conversion.
  const unsigned ConvOpc = Conv.getOpcode();
  const bool IsFPToInt =
      ConvOpc == ISD::FP_TO_SINT || ConvOpc == ISD::FP_TO_UINT;
  if (!IsFPToInt || !Conv->hasOneUse())
    return SDValue();

  // Only f32 and f64 sources are handled.
  SDValue FPOperand = Conv.getOperand(0);
  EVT FPVT = FPOperand.getValueType();
  if (!(FPVT == MVT::f32 || FPVT == MVT::f64))
    return SDValue();

  // The integer result must have the same width as the FP source; a
  // conversion such as i64 = fp_to_sint f32 is not supported.
  EVT IntVT = Conv.getSimpleValueType();
  if (IntVT != FPVT.changeTypeToInteger())
    return SDValue();

  // Build a 128-bit vector type (v4f32 or v2f64) to avoid widening. A
  // selection pattern turns the vector conversion of a scalar_to_vector back
  // into a single-element conversion.
  const unsigned LaneCount = 128 / FPVT.getFixedSizeInBits();
  EVT FPVecVT = EVT::getVectorVT(*DAG.getContext(), FPVT, LaneCount);
  EVT IntVecVT = FPVecVT.changeTypeToInteger();

  SDLoc DL(ST);
  SDValue FPVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, FPVecVT, FPOperand);
  SDValue IntVec = DAG.getNode(ConvOpc, DL, IntVecVT, FPVec);

  // Lane 0 of the converted vector holds the value to be stored.
  SDValue LaneZero = DAG.getVectorIdxConstant(0, DL);
  SDValue Lane0Val =
      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, IntVT, IntVec, LaneZero);

  // Replace the originally stored value (including any assertion wrappers)
  // with the lane extract and report the store as the updated node.
  DCI.CombineTo(StoredVal.getNode(), Lane0Val);
  return SDValue(ST, 0);
}

bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
(SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
Expand Down Expand Up @@ -24024,6 +24078,9 @@ static SDValue performSTORECombine(SDNode *N,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDLoc DL(ST);

if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
return Res;

auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
EVT EltVT = VT.getVectorElementType();
return EltVT == MVT::f32 || EltVT == MVT::f64;
Expand Down
9 changes: 9 additions & 0 deletions llvm/lib/Target/AArch64/AArch64InstrInfo.td
Original file line number Diff line number Diff line change
Expand Up @@ -6646,6 +6646,15 @@ def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
(UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
}

// Select a 128-bit vector fp_to_sint/fp_to_uint whose operand is a
// scalar_to_vector of an FPR32/FPR64 value as the single-element
// FCVTZS/FCVTZU instruction (v1i32 for f32, v1i64 for f64). The converted
// scalar is placed in the ssub/dsub subregister of an IMPLICIT_DEF vector, so
// only lane 0 of the result carries a defined value.
def : Pat<(v4i32 (any_fp_to_sint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
(v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZSv1i32 (f32 FPR32:$src))), ssub))>;
def : Pat<(v4i32 (any_fp_to_uint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
(v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZUv1i32 (f32 FPR32:$src))), ssub))>;
def : Pat<(v2i64 (any_fp_to_sint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
(v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZSv1i64 (f64 FPR64:$src))), dsub))>;
def : Pat<(v2i64 (any_fp_to_uint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
(v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZUv1i64 (f64 FPR64:$src))), dsub))>;

// int -> float conversion of value in lane 0 of simd vector should use
// correct cvtf variant to avoid costly fpr <-> gpr register transfers.
def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),
Expand Down
4 changes: 2 additions & 2 deletions llvm/test/CodeGen/AArch64/selectopt-const.ll
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,8 @@ define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) {
; CHECK-NEXT: csel x10, x9, xzr, lt
; CHECK-NEXT: subs x8, x8, #1
; CHECK-NEXT: ldr s3, [x4, x10]
; CHECK-NEXT: fcvtzs w10, s3
; CHECK-NEXT: str w10, [x2], #4
; CHECK-NEXT: fcvtzs s3, s3
; CHECK-NEXT: st1 { v3.s }[0], [x2], #4
; CHECK-NEXT: b.ne .LBB0_2
; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup
; CHECK-NEXT: mov w0, wzr
Expand Down
131 changes: 131 additions & 0 deletions llvm/test/CodeGen/AArch64/store-float-conversion.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc -verify-machineinstrs -mtriple=aarch64 < %s | FileCheck %s

; fptoui float -> i32, truncated to i8 and stored. The conversion is expected
; to stay in a SIMD/FP register (fcvtzu s0, s0) and be stored as a byte from it.
define void @f32_to_u8(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_u8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzu s0, s0
; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
entry:
%conv = fptoui float %f to i32
%trunc = trunc i32 %conv to i8
store i8 %trunc, ptr %dst
ret void
}

; fptosi float -> i32, truncated to i8 and stored. The conversion is expected
; to stay in a SIMD/FP register (fcvtzs s0, s0) and be stored as a byte from it.
define void @f32_to_s8(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_s8:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs s0, s0
; CHECK-NEXT: str b0, [x0]
; CHECK-NEXT: ret
entry:
%conv = fptosi float %f to i32
%trunc = trunc i32 %conv to i8
store i8 %trunc, ptr %dst
ret void
}

; fptoui float -> i32, truncated to i16 and stored. The conversion is expected
; to stay in a SIMD/FP register (fcvtzu s0, s0) and be stored as a halfword
; from it.
define void @f32_to_u16(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_u16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzu s0, s0
; CHECK-NEXT: str h0, [x0]
; CHECK-NEXT: ret
entry:
%conv = fptoui float %f to i32
%trunc = trunc i32 %conv to i16
store i16 %trunc, ptr %dst
ret void
}

; fptosi float -> i32, truncated to i16 and stored. The conversion is expected
; to stay in a SIMD/FP register (fcvtzs s0, s0) and be stored as a halfword
; from it.
define void @f32_to_s16(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_s16:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs s0, s0
; CHECK-NEXT: str h0, [x0]
; CHECK-NEXT: ret
entry:
%conv = fptosi float %f to i32
%trunc = trunc i32 %conv to i16
store i16 %trunc, ptr %dst
ret void
}

; fptoui float -> i32 stored directly. The conversion is expected to stay in a
; SIMD/FP register (fcvtzu s0, s0) and be stored with str s0.
define void @f32_to_u32(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_u32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzu s0, s0
; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: ret
entry:
%conv = fptoui float %f to i32
store i32 %conv, ptr %dst
ret void
}

; fptosi float -> i32 stored directly. The conversion is expected to stay in a
; SIMD/FP register (fcvtzs s0, s0) and be stored with str s0.
define void @f32_to_s32(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_s32:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs s0, s0
; CHECK-NEXT: str s0, [x0]
; CHECK-NEXT: ret
entry:
%conv = fptosi float %f to i32
store i32 %conv, ptr %dst
ret void
}

; Negative test: fptosi float -> i32 is sign-extended to i64 before the store.
; The CHECK lines expect the conversion to go through a GPR (fcvtzs w8 + sxtw)
; rather than being stored from a SIMD/FP register.
define void @f32_to_s64(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_s64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs w8, s0
; CHECK-NEXT: sxtw x8, w8
; CHECK-NEXT: str x8, [x0]
; CHECK-NEXT: ret
entry:
%conv = fptosi float %f to i32
%ext = sext i32 %conv to i64
store i64 %ext, ptr %dst
ret void
}

; fptoui double -> i64 stored directly. The conversion is expected to stay in
; a SIMD/FP register (fcvtzu d0, d0) and be stored with str d0.
define void @f64_to_u64(double %d, ptr %dst) {
; CHECK-LABEL: f64_to_u64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzu d0, d0
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
entry:
%conv = fptoui double %d to i64
store i64 %conv, ptr %dst
ret void
}

; fptosi double -> i64 stored directly. The conversion is expected to stay in
; a SIMD/FP register (fcvtzs d0, d0) and be stored with str d0.
define void @f64_to_s64(double %d, ptr %dst) {
; CHECK-LABEL: f64_to_s64:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs d0, d0
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret
entry:
%conv = fptosi double %d to i64
store i64 %conv, ptr %dst
ret void
}

; Negative test: the fptosi result is both truncated/stored and returned, so
; it has more than one use. The CHECK lines expect the conversion to target a
; GPR (fcvtzs w8, s0) and the store to use strb from that GPR.
define i32 @f32_to_i32_multiple_uses(float %f, ptr %dst) {
; CHECK-LABEL: f32_to_i32_multiple_uses:
; CHECK: // %bb.0: // %entry
; CHECK-NEXT: fcvtzs w8, s0
; CHECK-NEXT: mov x9, x0
; CHECK-NEXT: mov w0, w8
; CHECK-NEXT: strb w8, [x9]
; CHECK-NEXT: ret
entry:
%conv = fptosi float %f to i32
%trunc = trunc i32 %conv to i8
store i8 %trunc, ptr %dst
ret i32 %conv
}
41 changes: 21 additions & 20 deletions llvm/test/CodeGen/AArch64/tbl-loops.ll
Original file line number Diff line number Diff line change
Expand Up @@ -63,7 +63,8 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcmp s2, #0.0
; CHECK-NEXT: fcsel s2, s0, s3, mi
; CHECK-NEXT: subs w10, w10, #1
; CHECK-NEXT: fcvtzs w11, s2
; CHECK-NEXT: fcvtzs s2, s2
; CHECK-NEXT: fmov w11, s2
; CHECK-NEXT: strb w11, [x9], #1
; CHECK-NEXT: b.ne .LBB0_7
; CHECK-NEXT: .LBB0_8: // %for.cond.cleanup
Expand Down Expand Up @@ -178,12 +179,12 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcmp s3, s1
; CHECK-NEXT: fcsel s4, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
; CHECK-NEXT: fcvtzs w11, s2
; CHECK-NEXT: fcvtzs s2, s2
; CHECK-NEXT: fcsel s3, s0, s4, mi
; CHECK-NEXT: subs w10, w10, #1
; CHECK-NEXT: strb w11, [x9]
; CHECK-NEXT: fcvtzs w12, s3
; CHECK-NEXT: strb w12, [x9, #1]
; CHECK-NEXT: str b2, [x9]
; CHECK-NEXT: fcvtzs s3, s3
; CHECK-NEXT: stur b3, [x9, #1]
; CHECK-NEXT: add x9, x9, #2
; CHECK-NEXT: b.ne .LBB1_6
; CHECK-NEXT: .LBB1_7: // %for.cond.cleanup
Expand Down Expand Up @@ -395,19 +396,19 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcsel s4, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
; CHECK-NEXT: ldr s3, [x8, #8]
; CHECK-NEXT: fcvtzs w11, s2
; CHECK-NEXT: fcvtzs s2, s2
; CHECK-NEXT: add x8, x8, #12
; CHECK-NEXT: fcsel s4, s0, s4, mi
; CHECK-NEXT: fcmp s3, s1
; CHECK-NEXT: strb w11, [x9]
; CHECK-NEXT: str b2, [x9]
; CHECK-NEXT: fcsel s5, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
; CHECK-NEXT: fcvtzs w12, s4
; CHECK-NEXT: fcvtzs s4, s4
; CHECK-NEXT: fcsel s3, s0, s5, mi
; CHECK-NEXT: subs w10, w10, #1
; CHECK-NEXT: strb w12, [x9, #1]
; CHECK-NEXT: fcvtzs w13, s3
; CHECK-NEXT: strb w13, [x9, #2]
; CHECK-NEXT: stur b4, [x9, #1]
; CHECK-NEXT: fcvtzs s3, s3
; CHECK-NEXT: stur b3, [x9, #2]
; CHECK-NEXT: add x9, x9, #3
; CHECK-NEXT: b.ne .LBB2_8
; CHECK-NEXT: .LBB2_9: // %for.cond.cleanup
Expand Down Expand Up @@ -563,26 +564,26 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: fcmp s3, s1
; CHECK-NEXT: fcsel s4, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
; CHECK-NEXT: fcvtzs w11, s2
; CHECK-NEXT: fcvtzs s2, s2
; CHECK-NEXT: ldp s3, s5, [x8, #8]
; CHECK-NEXT: add x8, x8, #16
; CHECK-NEXT: fcsel s4, s0, s4, mi
; CHECK-NEXT: fcmp s3, s1
; CHECK-NEXT: strb w11, [x9]
; CHECK-NEXT: fcvtzs w12, s4
; CHECK-NEXT: str b2, [x9]
; CHECK-NEXT: fcvtzs s4, s4
; CHECK-NEXT: fcsel s6, s1, s3, gt
; CHECK-NEXT: fcmp s3, #0.0
; CHECK-NEXT: fcsel s3, s0, s6, mi
; CHECK-NEXT: fcmp s5, s1
; CHECK-NEXT: strb w12, [x9, #1]
; CHECK-NEXT: stur b4, [x9, #1]
; CHECK-NEXT: fcsel s6, s1, s5, gt
; CHECK-NEXT: fcmp s5, #0.0
; CHECK-NEXT: fcvtzs w13, s3
; CHECK-NEXT: fcsel s2, s0, s6, mi
; CHECK-NEXT: fcvtzs s3, s3
; CHECK-NEXT: fcsel s5, s0, s6, mi
; CHECK-NEXT: subs w10, w10, #1
; CHECK-NEXT: strb w13, [x9, #2]
; CHECK-NEXT: fcvtzs w14, s2
; CHECK-NEXT: strb w14, [x9, #3]
; CHECK-NEXT: stur b3, [x9, #2]
; CHECK-NEXT: fcvtzs s5, s5
; CHECK-NEXT: stur b5, [x9, #3]
; CHECK-NEXT: add x9, x9, #4
; CHECK-NEXT: b.ne .LBB3_6
; CHECK-NEXT: .LBB3_7: // %for.cond.cleanup
Expand Down