Skip to content

Commit 2d33584

Browse files
committed
[AArch64] Keep floating-point conversion in SIMD
1 parent 114d74e commit 2d33584

File tree

5 files changed

+210
-23
lines changed

5 files changed

+210
-23
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24026,6 +24026,63 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
2402624026
Store->getMemOperand());
2402724027
}
2402824028

24029+
// Combine store (fp_to_int X) with optional extensions/trunctions to use vector
24030+
// semantics when NEON is available.
24031+
static void combineFPToIntStore(StoreSDNode *ST,
24032+
TargetLowering::DAGCombinerInfo &DCI,
24033+
SelectionDAG &DAG,
24034+
const AArch64Subtarget *Subtarget) {
24035+
if (!Subtarget->isNeonAvailable())
24036+
return;
24037+
24038+
SDValue Value = ST->getValue();
24039+
// Peel extensions, truncations and assertions.
24040+
for (;;) {
24041+
if (!Value->hasOneUse())
24042+
break;
24043+
if (!ISD::isExtOpcode(Value.getOpcode()) &&
24044+
Value.getOpcode() != ISD::TRUNCATE && !Value->isAssert())
24045+
break;
24046+
Value = Value.getOperand(0);
24047+
}
24048+
24049+
if (Value.getOpcode() != ISD::FP_TO_UINT &&
24050+
Value.getOpcode() != ISD::FP_TO_SINT)
24051+
return;
24052+
if (!Value.hasOneUse())
24053+
return;
24054+
24055+
SDValue FPSrc = Value.getOperand(0);
24056+
EVT SrcVT = FPSrc.getValueType();
24057+
if (SrcVT.isVector())
24058+
return;
24059+
24060+
// Create a two-element vector to avoid widening. The floating point
24061+
// conversion is transformed into a single element conversion via a pattern.
24062+
EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, 2);
24063+
EVT DstVT = MVT::getIntegerVT(SrcVT.getScalarSizeInBits());
24064+
EVT VecDstVT = EVT::getVectorVT(*DAG.getContext(), DstVT, 2);
24065+
SDLoc DL(ST);
24066+
SDValue UndefVec = DAG.getUNDEF(VecSrcVT);
24067+
SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
24068+
SDValue VecFP =
24069+
DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VecSrcVT, UndefVec, FPSrc, Zero);
24070+
24071+
SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
24072+
24073+
if (Value.getValueSizeInBits() != DstVT.getSizeInBits()) {
24074+
EVT NewVecDstVT = EVT::getVectorVT(*DAG.getContext(), Value.getValueType(),
24075+
VecDstVT.getFixedSizeInBits() /
24076+
Value.getScalarValueSizeInBits());
24077+
VecConv = DAG.getNode(ISD::BITCAST, DL, NewVecDstVT, VecConv);
24078+
}
24079+
24080+
SDValue Extracted = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
24081+
Value.getValueType(), VecConv, Zero);
24082+
24083+
DCI.CombineTo(Value.getNode(), Extracted);
24084+
}
24085+
2402924086
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
2403024087
return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
2403124088
(SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
@@ -24108,6 +24165,8 @@ static SDValue performSTORECombine(SDNode *N,
2410824165
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2410924166
SDLoc DL(ST);
2411024167

24168+
combineFPToIntStore(ST, DCI, DAG, Subtarget);
24169+
2411124170
auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
2411224171
EVT EltVT = VT.getVectorElementType();
2411324172
return EltVT == MVT::f32 || EltVT == MVT::f64;

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6627,6 +6627,17 @@ def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
66276627
(UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
66286628
}
66296629

6630+
// Select a two-element vector fp_to_uint/fp_to_sint whose operand is a
// scalar_to_vector of an f32/f64 value as the single-element FCVTZU/FCVTZS
// instruction applied to that scalar. The converted value is inserted into the
// ssub (32-bit) or dsub (64-bit) subregister of an otherwise undefined
// (IMPLICIT_DEF) vector register; the remaining lanes are left undefined.
let Predicates = [HasNEONandIsSME2p2StreamingSafe] in {
6631+
def : Pat<(v2i32 (fp_to_uint (v2f32 (scalar_to_vector (f32 FPR32:$src))))),
6632+
(v2i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZUv1i32 (f32 FPR32:$src))), ssub))>;
6633+
def : Pat<(v2i32 (fp_to_sint (v2f32 (scalar_to_vector (f32 FPR32:$src))))),
6634+
(v2i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZSv1i32 (f32 FPR32:$src))), ssub))>;
6635+
def : Pat<(v2i64 (fp_to_uint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
6636+
(v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZUv1i64 (f64 FPR64:$src))), dsub))>;
6637+
def : Pat<(v2i64 (fp_to_sint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
6638+
(v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZSv1i64 (f64 FPR64:$src))), dsub))>;
6639+
}
6640+
66306641
// int -> float conversion of value in lane 0 of simd vector should use
66316642
// correct cvtf variant to avoid costly fpr <-> gpr register transfers.
66326643
def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),

llvm/test/CodeGen/AArch64/selectopt-const.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) {
2929
; CHECK-NEXT: csel x10, x9, xzr, lt
3030
; CHECK-NEXT: subs x8, x8, #1
3131
; CHECK-NEXT: ldr s3, [x4, x10]
32-
; CHECK-NEXT: fcvtzs w10, s3
33-
; CHECK-NEXT: str w10, [x2], #4
32+
; CHECK-NEXT: fcvtzs s3, s3
33+
; CHECK-NEXT: st1 { v3.s }[0], [x2], #4
3434
; CHECK-NEXT: b.ne .LBB0_2
3535
; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup
3636
; CHECK-NEXT: mov w0, wzr
Lines changed: 117 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,117 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -verify-machineinstrs -mtriple=aarch64 < %s | FileCheck %s
3+
4+
define void @f32_to_u8(float %f, ptr %dst) {
5+
; CHECK-LABEL: f32_to_u8:
6+
; CHECK: // %bb.0: // %entry
7+
; CHECK-NEXT: fcvtzu s0, s0
8+
; CHECK-NEXT: str b0, [x0]
9+
; CHECK-NEXT: ret
10+
entry:
11+
%conv = fptoui float %f to i32
12+
%trunc = trunc i32 %conv to i8
13+
store i8 %trunc, ptr %dst
14+
ret void
15+
}
16+
17+
define void @f32_to_s8(float %f, ptr %dst) {
18+
; CHECK-LABEL: f32_to_s8:
19+
; CHECK: // %bb.0: // %entry
20+
; CHECK-NEXT: fcvtzs s0, s0
21+
; CHECK-NEXT: str b0, [x0]
22+
; CHECK-NEXT: ret
23+
entry:
24+
%conv = fptosi float %f to i32
25+
%trunc = trunc i32 %conv to i8
26+
store i8 %trunc, ptr %dst
27+
ret void
28+
}
29+
30+
define void @f32_to_u16(float %f, ptr %dst) {
31+
; CHECK-LABEL: f32_to_u16:
32+
; CHECK: // %bb.0: // %entry
33+
; CHECK-NEXT: fcvtzu s0, s0
34+
; CHECK-NEXT: str h0, [x0]
35+
; CHECK-NEXT: ret
36+
entry:
37+
%conv = fptoui float %f to i32
38+
%trunc = trunc i32 %conv to i16
39+
store i16 %trunc, ptr %dst
40+
ret void
41+
}
42+
43+
define void @f32_to_s16(float %f, ptr %dst) {
44+
; CHECK-LABEL: f32_to_s16:
45+
; CHECK: // %bb.0: // %entry
46+
; CHECK-NEXT: fcvtzs s0, s0
47+
; CHECK-NEXT: str h0, [x0]
48+
; CHECK-NEXT: ret
49+
entry:
50+
%conv = fptosi float %f to i32
51+
%trunc = trunc i32 %conv to i16
52+
store i16 %trunc, ptr %dst
53+
ret void
54+
}
55+
56+
define void @f32_to_u32(float %f, ptr %dst) {
57+
; CHECK-LABEL: f32_to_u32:
58+
; CHECK: // %bb.0: // %entry
59+
; CHECK-NEXT: fcvtzu s0, s0
60+
; CHECK-NEXT: str s0, [x0]
61+
; CHECK-NEXT: ret
62+
entry:
63+
%conv = fptoui float %f to i32
64+
store i32 %conv, ptr %dst
65+
ret void
66+
}
67+
68+
define void @f32_to_s32(float %f, ptr %dst) {
69+
; CHECK-LABEL: f32_to_s32:
70+
; CHECK: // %bb.0: // %entry
71+
; CHECK-NEXT: fcvtzs s0, s0
72+
; CHECK-NEXT: str s0, [x0]
73+
; CHECK-NEXT: ret
74+
entry:
75+
%conv = fptosi float %f to i32
76+
store i32 %conv, ptr %dst
77+
ret void
78+
}
79+
80+
define void @f64_to_u64(double %d, ptr %dst) {
81+
; CHECK-LABEL: f64_to_u64:
82+
; CHECK: // %bb.0: // %entry
83+
; CHECK-NEXT: fcvtzu d0, d0
84+
; CHECK-NEXT: str d0, [x0]
85+
; CHECK-NEXT: ret
86+
entry:
87+
%conv = fptoui double %d to i64
88+
store i64 %conv, ptr %dst
89+
ret void
90+
}
91+
92+
define void @f64_to_s64(double %d, ptr %dst) {
93+
; CHECK-LABEL: f64_to_s64:
94+
; CHECK: // %bb.0: // %entry
95+
; CHECK-NEXT: fcvtzs d0, d0
96+
; CHECK-NEXT: str d0, [x0]
97+
; CHECK-NEXT: ret
98+
entry:
99+
%conv = fptosi double %d to i64
100+
store i64 %conv, ptr %dst
101+
ret void
102+
}
103+
104+
define i32 @f32_to_i32_multiple_uses(float %f, ptr %dst) {
105+
; CHECK-LABEL: f32_to_i32_multiple_uses:
106+
; CHECK: // %bb.0: // %entry
107+
; CHECK-NEXT: fcvtzs w8, s0
108+
; CHECK-NEXT: mov x9, x0
109+
; CHECK-NEXT: mov w0, w8
110+
; CHECK-NEXT: strb w8, [x9]
111+
; CHECK-NEXT: ret
112+
entry:
113+
%conv = fptosi float %f to i32
114+
%trunc = trunc i32 %conv to i8
115+
store i8 %trunc, ptr %dst
116+
ret i32 %conv
117+
}

llvm/test/CodeGen/AArch64/tbl-loops.ll

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
6363
; CHECK-NEXT: fcmp s2, #0.0
6464
; CHECK-NEXT: fcsel s2, s0, s3, mi
6565
; CHECK-NEXT: subs w10, w10, #1
66-
; CHECK-NEXT: fcvtzs w11, s2
67-
; CHECK-NEXT: strb w11, [x9], #1
66+
; CHECK-NEXT: fcvtzu s2, s2
67+
; CHECK-NEXT: st1 { v2.b }[0], [x9], #1
6868
; CHECK-NEXT: b.ne .LBB0_7
6969
; CHECK-NEXT: .LBB0_8: // %for.cond.cleanup
7070
; CHECK-NEXT: ret
@@ -178,12 +178,12 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
178178
; CHECK-NEXT: fcmp s3, s1
179179
; CHECK-NEXT: fcsel s4, s1, s3, gt
180180
; CHECK-NEXT: fcmp s3, #0.0
181-
; CHECK-NEXT: fcvtzs w11, s2
181+
; CHECK-NEXT: fcvtzu s2, s2
182182
; CHECK-NEXT: fcsel s3, s0, s4, mi
183183
; CHECK-NEXT: subs w10, w10, #1
184-
; CHECK-NEXT: strb w11, [x9]
185-
; CHECK-NEXT: fcvtzs w12, s3
186-
; CHECK-NEXT: strb w12, [x9, #1]
184+
; CHECK-NEXT: str b2, [x9]
185+
; CHECK-NEXT: fcvtzu s3, s3
186+
; CHECK-NEXT: stur b3, [x9, #1]
187187
; CHECK-NEXT: add x9, x9, #2
188188
; CHECK-NEXT: b.ne .LBB1_6
189189
; CHECK-NEXT: .LBB1_7: // %for.cond.cleanup
@@ -395,19 +395,19 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
395395
; CHECK-NEXT: fcsel s4, s1, s3, gt
396396
; CHECK-NEXT: fcmp s3, #0.0
397397
; CHECK-NEXT: ldr s3, [x8, #8]
398-
; CHECK-NEXT: fcvtzs w11, s2
398+
; CHECK-NEXT: fcvtzu s2, s2
399399
; CHECK-NEXT: add x8, x8, #12
400400
; CHECK-NEXT: fcsel s4, s0, s4, mi
401401
; CHECK-NEXT: fcmp s3, s1
402-
; CHECK-NEXT: strb w11, [x9]
402+
; CHECK-NEXT: str b2, [x9]
403403
; CHECK-NEXT: fcsel s5, s1, s3, gt
404404
; CHECK-NEXT: fcmp s3, #0.0
405-
; CHECK-NEXT: fcvtzs w12, s4
405+
; CHECK-NEXT: fcvtzu s4, s4
406406
; CHECK-NEXT: fcsel s3, s0, s5, mi
407407
; CHECK-NEXT: subs w10, w10, #1
408-
; CHECK-NEXT: strb w12, [x9, #1]
409-
; CHECK-NEXT: fcvtzs w13, s3
410-
; CHECK-NEXT: strb w13, [x9, #2]
408+
; CHECK-NEXT: stur b4, [x9, #1]
409+
; CHECK-NEXT: fcvtzu s3, s3
410+
; CHECK-NEXT: stur b3, [x9, #2]
411411
; CHECK-NEXT: add x9, x9, #3
412412
; CHECK-NEXT: b.ne .LBB2_8
413413
; CHECK-NEXT: .LBB2_9: // %for.cond.cleanup
@@ -563,26 +563,26 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
563563
; CHECK-NEXT: fcmp s3, s1
564564
; CHECK-NEXT: fcsel s4, s1, s3, gt
565565
; CHECK-NEXT: fcmp s3, #0.0
566-
; CHECK-NEXT: fcvtzs w11, s2
566+
; CHECK-NEXT: fcvtzu s2, s2
567567
; CHECK-NEXT: ldp s3, s5, [x8, #8]
568568
; CHECK-NEXT: add x8, x8, #16
569569
; CHECK-NEXT: fcsel s4, s0, s4, mi
570570
; CHECK-NEXT: fcmp s3, s1
571-
; CHECK-NEXT: strb w11, [x9]
572-
; CHECK-NEXT: fcvtzs w12, s4
571+
; CHECK-NEXT: str b2, [x9]
572+
; CHECK-NEXT: fcvtzu s4, s4
573573
; CHECK-NEXT: fcsel s6, s1, s3, gt
574574
; CHECK-NEXT: fcmp s3, #0.0
575575
; CHECK-NEXT: fcsel s3, s0, s6, mi
576576
; CHECK-NEXT: fcmp s5, s1
577-
; CHECK-NEXT: strb w12, [x9, #1]
577+
; CHECK-NEXT: stur b4, [x9, #1]
578578
; CHECK-NEXT: fcsel s6, s1, s5, gt
579579
; CHECK-NEXT: fcmp s5, #0.0
580-
; CHECK-NEXT: fcvtzs w13, s3
581-
; CHECK-NEXT: fcsel s2, s0, s6, mi
580+
; CHECK-NEXT: fcvtzu s3, s3
581+
; CHECK-NEXT: fcsel s5, s0, s6, mi
582582
; CHECK-NEXT: subs w10, w10, #1
583-
; CHECK-NEXT: strb w13, [x9, #2]
584-
; CHECK-NEXT: fcvtzs w14, s2
585-
; CHECK-NEXT: strb w14, [x9, #3]
583+
; CHECK-NEXT: stur b3, [x9, #2]
584+
; CHECK-NEXT: fcvtzu s5, s5
585+
; CHECK-NEXT: stur b5, [x9, #3]
586586
; CHECK-NEXT: add x9, x9, #4
587587
; CHECK-NEXT: b.ne .LBB3_6
588588
; CHECK-NEXT: .LBB3_7: // %for.cond.cleanup

0 commit comments

Comments
 (0)