Skip to content

Commit 4eee9a4

Browse files
committed
[AArch64] Keep floating-point conversion in SIMD
1 parent 83dfdd8 commit 4eee9a4

File tree

5 files changed

+227
-23
lines changed

5 files changed

+227
-23
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24112,6 +24112,67 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
2411224112
Store->getMemOperand());
2411324113
}
2411424114

24115+
// Combine store (fp_to_int X) to use vector semantics around the conversion
24116+
// when NEON is available. This allows us to store the in-vector result directly
24117+
// without transferring the result into a GPR in the process.
24118+
static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
24119+
TargetLowering::DAGCombinerInfo &DCI,
24120+
SelectionDAG &DAG,
24121+
const AArch64Subtarget *Subtarget) {
24122+
// Limit to post-legalization in order to avoid peeling truncating stores.
24123+
if (DCI.isBeforeLegalize())
24124+
return SDValue();
24125+
if (!Subtarget->isNeonAvailable())
24126+
return SDValue();
24127+
// Source operand is already a vector.
24128+
SDValue Value = ST->getValue();
24129+
if (Value.getValueType().isVector())
24130+
return SDValue();
24131+
24132+
// Look through potential assertions.
24133+
while (Value->isAssert())
24134+
Value = Value.getOperand(0);
24135+
24136+
if (Value.getOpcode() != ISD::FP_TO_SINT &&
24137+
Value.getOpcode() != ISD::FP_TO_UINT)
24138+
return SDValue();
24139+
if (!Value->hasOneUse())
24140+
return SDValue();
24141+
24142+
SDValue FPSrc = Value.getOperand(0);
24143+
EVT SrcVT = FPSrc.getValueType();
24144+
if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
24145+
return SDValue();
24146+
24147+
// No support for assignments such as i64 = fp_to_sint i32
24148+
EVT VT = Value.getSimpleValueType();
24149+
if (VT != SrcVT.changeTypeToInteger())
24150+
return SDValue();
24151+
24152+
// Create a 128-bit element vector to avoid widening. The floating point
24153+
// conversion is transformed into a single element conversion via a pattern.
24154+
unsigned NumElements = 128 / SrcVT.getFixedSizeInBits();
24155+
EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements);
24156+
EVT VecDstVT = VecSrcVT.changeTypeToInteger();
24157+
SDLoc DL(ST);
24158+
SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc);
24159+
SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
24160+
24161+
if (ST->isTruncatingStore()) {
24162+
EVT NewVecDstVT = EVT::getVectorVT(
24163+
*DAG.getContext(), ST->getMemoryVT(),
24164+
VecDstVT.getFixedSizeInBits() / ST->getMemoryVT().getFixedSizeInBits());
24165+
VecConv = DAG.getNode(AArch64ISD::NVCAST, DL, NewVecDstVT, VecConv);
24166+
}
24167+
24168+
SDValue Zero = DAG.getVectorIdxConstant(0, DL);
24169+
SDValue Extracted =
24170+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero);
24171+
24172+
DCI.CombineTo(ST->getValue().getNode(), Extracted);
24173+
return SDValue(ST, 0);
24174+
}
24175+
2411524176
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
2411624177
return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
2411724178
(SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
@@ -24194,6 +24255,9 @@ static SDValue performSTORECombine(SDNode *N,
2419424255
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2419524256
SDLoc DL(ST);
2419624257

24258+
if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
24259+
return Res;
24260+
2419724261
auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
2419824262
EVT EltVT = VT.getVectorElementType();
2419924263
return EltVT == MVT::f32 || EltVT == MVT::f64;

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6668,6 +6668,15 @@ def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
66686668
(UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
66696669
}
66706670

6671+
// Select a float-to-integer conversion whose vector operand is a
// scalar_to_vector of a single f32/f64 as the scalar FCVTZS/FCVTZU
// instruction (v1i32 / v1i64 forms) applied to the original scalar register.
// The scalar result is inserted into the ssub (32-bit) or dsub (64-bit)
// subregister of an otherwise undefined (IMPLICIT_DEF) v4i32 / v2i64 value.
// One pattern each for: f32 signed, f32 unsigned, f64 signed, f64 unsigned.
def : Pat<(v4i32 (any_fp_to_sint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
6672+
(v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZSv1i32 (f32 FPR32:$src))), ssub))>;
6673+
def : Pat<(v4i32 (any_fp_to_uint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
6674+
(v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZUv1i32 (f32 FPR32:$src))), ssub))>;
6675+
def : Pat<(v2i64 (any_fp_to_sint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
6676+
(v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZSv1i64 (f64 FPR64:$src))), dsub))>;
6677+
def : Pat<(v2i64 (any_fp_to_uint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
6678+
(v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZUv1i64 (f64 FPR64:$src))), dsub))>;
6679+
66716680
// int -> float conversion of value in lane 0 of simd vector should use
66726681
// correct cvtf variant to avoid costly fpr <-> gpr register transfers.
66736682
def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),

llvm/test/CodeGen/AArch64/selectopt-const.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) {
2929
; CHECK-NEXT: csel x10, x9, xzr, lt
3030
; CHECK-NEXT: subs x8, x8, #1
3131
; CHECK-NEXT: ldr s3, [x4, x10]
32-
; CHECK-NEXT: fcvtzs w10, s3
33-
; CHECK-NEXT: str w10, [x2], #4
32+
; CHECK-NEXT: fcvtzs s3, s3
33+
; CHECK-NEXT: st1 { v3.s }[0], [x2], #4
3434
; CHECK-NEXT: b.ne .LBB0_2
3535
; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup
3636
; CHECK-NEXT: mov w0, wzr
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -verify-machineinstrs -mtriple=aarch64 < %s | FileCheck %s
3+
4+
; float -> i32 (fptoui), truncated to i8 and stored. The expected assembly
; converts within an FP/SIMD register (fcvtzu s0, s0) and stores the low byte
; straight from it (str b0).
define void @f32_to_u8(float %f, ptr %dst) {
5+
; CHECK-LABEL: f32_to_u8:
6+
; CHECK: // %bb.0: // %entry
7+
; CHECK-NEXT: fcvtzu s0, s0
8+
; CHECK-NEXT: str b0, [x0]
9+
; CHECK-NEXT: ret
10+
entry:
11+
%conv = fptoui float %f to i32
12+
%trunc = trunc i32 %conv to i8
13+
store i8 %trunc, ptr %dst
14+
ret void
15+
}
16+
17+
; float -> i32 (fptosi), truncated to i8 and stored. The expected assembly
; converts within an FP/SIMD register (fcvtzs s0, s0) and stores the low byte
; straight from it (str b0).
define void @f32_to_s8(float %f, ptr %dst) {
18+
; CHECK-LABEL: f32_to_s8:
19+
; CHECK: // %bb.0: // %entry
20+
; CHECK-NEXT: fcvtzs s0, s0
21+
; CHECK-NEXT: str b0, [x0]
22+
; CHECK-NEXT: ret
23+
entry:
24+
%conv = fptosi float %f to i32
25+
%trunc = trunc i32 %conv to i8
26+
store i8 %trunc, ptr %dst
27+
ret void
28+
}
29+
30+
; float -> i32 (fptoui), truncated to i16 and stored. The expected assembly
; converts within an FP/SIMD register (fcvtzu s0, s0) and stores the low
; halfword straight from it (str h0).
define void @f32_to_u16(float %f, ptr %dst) {
31+
; CHECK-LABEL: f32_to_u16:
32+
; CHECK: // %bb.0: // %entry
33+
; CHECK-NEXT: fcvtzu s0, s0
34+
; CHECK-NEXT: str h0, [x0]
35+
; CHECK-NEXT: ret
36+
entry:
37+
%conv = fptoui float %f to i32
38+
%trunc = trunc i32 %conv to i16
39+
store i16 %trunc, ptr %dst
40+
ret void
41+
}
42+
43+
; float -> i32 (fptosi), truncated to i16 and stored. The expected assembly
; converts within an FP/SIMD register (fcvtzs s0, s0) and stores the low
; halfword straight from it (str h0).
define void @f32_to_s16(float %f, ptr %dst) {
44+
; CHECK-LABEL: f32_to_s16:
45+
; CHECK: // %bb.0: // %entry
46+
; CHECK-NEXT: fcvtzs s0, s0
47+
; CHECK-NEXT: str h0, [x0]
48+
; CHECK-NEXT: ret
49+
entry:
50+
%conv = fptosi float %f to i32
51+
%trunc = trunc i32 %conv to i16
52+
store i16 %trunc, ptr %dst
53+
ret void
54+
}
55+
56+
; float -> i32 (fptoui) stored without truncation. The expected assembly
; converts within an FP/SIMD register (fcvtzu s0, s0) and stores it directly
; (str s0).
define void @f32_to_u32(float %f, ptr %dst) {
57+
; CHECK-LABEL: f32_to_u32:
58+
; CHECK: // %bb.0: // %entry
59+
; CHECK-NEXT: fcvtzu s0, s0
60+
; CHECK-NEXT: str s0, [x0]
61+
; CHECK-NEXT: ret
62+
entry:
63+
%conv = fptoui float %f to i32
64+
store i32 %conv, ptr %dst
65+
ret void
66+
}
67+
68+
; float -> i32 (fptosi) stored without truncation. The expected assembly
; converts within an FP/SIMD register (fcvtzs s0, s0) and stores it directly
; (str s0).
define void @f32_to_s32(float %f, ptr %dst) {
69+
; CHECK-LABEL: f32_to_s32:
70+
; CHECK: // %bb.0: // %entry
71+
; CHECK-NEXT: fcvtzs s0, s0
72+
; CHECK-NEXT: str s0, [x0]
73+
; CHECK-NEXT: ret
74+
entry:
75+
%conv = fptosi float %f to i32
76+
store i32 %conv, ptr %dst
77+
ret void
78+
}
79+
80+
; float -> i32 (fptosi), then sign-extended to i64 before the store. Here the
; expected assembly still converts into a general-purpose register
; (fcvtzs w8, s0), sign-extends (sxtw) and stores from x8.
define void @f32_to_s64(float %f, ptr %dst) {
81+
; CHECK-LABEL: f32_to_s64:
82+
; CHECK: // %bb.0: // %entry
83+
; CHECK-NEXT: fcvtzs w8, s0
84+
; CHECK-NEXT: sxtw x8, w8
85+
; CHECK-NEXT: str x8, [x0]
86+
; CHECK-NEXT: ret
87+
entry:
88+
%conv = fptosi float %f to i32
89+
%ext = sext i32 %conv to i64
90+
store i64 %ext, ptr %dst
91+
ret void
92+
}
93+
94+
; double -> i64 (fptoui) stored without truncation. The expected assembly
; converts within an FP/SIMD register (fcvtzu d0, d0) and stores it directly
; (str d0).
define void @f64_to_u64(double %d, ptr %dst) {
94+
; CHECK-LABEL: f64_to_u64:
95+
; CHECK: // %bb.0: // %entry
96+
; CHECK-NEXT: fcvtzu d0, d0
97+
; CHECK-NEXT: str d0, [x0]
98+
; CHECK-NEXT: ret
99+
entry:
100+
%conv = fptoui double %d to i64
101+
store i64 %conv, ptr %dst
102+
ret void
103+
}
105+
106+
; double -> i64 (fptosi) stored without truncation. The expected assembly
; converts within an FP/SIMD register (fcvtzs d0, d0) and stores it directly
; (str d0).
define void @f64_to_s64(double %d, ptr %dst) {
107+
; CHECK-LABEL: f64_to_s64:
108+
; CHECK: // %bb.0: // %entry
109+
; CHECK-NEXT: fcvtzs d0, d0
110+
; CHECK-NEXT: str d0, [x0]
111+
; CHECK-NEXT: ret
112+
entry:
113+
%conv = fptosi double %d to i64
114+
store i64 %conv, ptr %dst
115+
ret void
116+
}
117+
118+
; The fptosi result has two uses: it is truncated to i8 and stored, and it is
; also returned. The expected assembly converts into a general-purpose
; register (fcvtzs w8, s0), stores the byte with strb and returns the value
; in w0.
define i32 @f32_to_i32_multiple_uses(float %f, ptr %dst) {
119+
; CHECK-LABEL: f32_to_i32_multiple_uses:
120+
; CHECK: // %bb.0: // %entry
121+
; CHECK-NEXT: fcvtzs w8, s0
122+
; CHECK-NEXT: mov x9, x0
123+
; CHECK-NEXT: mov w0, w8
124+
; CHECK-NEXT: strb w8, [x9]
125+
; CHECK-NEXT: ret
126+
entry:
127+
%conv = fptosi float %f to i32
128+
%trunc = trunc i32 %conv to i8
129+
store i8 %trunc, ptr %dst
130+
ret i32 %conv
131+
}

llvm/test/CodeGen/AArch64/tbl-loops.ll

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
6363
; CHECK-NEXT: fcmp s2, #0.0
6464
; CHECK-NEXT: fcsel s2, s0, s3, mi
6565
; CHECK-NEXT: subs w10, w10, #1
66-
; CHECK-NEXT: fcvtzs w11, s2
67-
; CHECK-NEXT: strb w11, [x9], #1
66+
; CHECK-NEXT: fcvtzs s2, s2
67+
; CHECK-NEXT: st1 { v2.b }[0], [x9], #1
6868
; CHECK-NEXT: b.ne .LBB0_7
6969
; CHECK-NEXT: .LBB0_8: // %for.cond.cleanup
7070
; CHECK-NEXT: ret
@@ -178,12 +178,12 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
178178
; CHECK-NEXT: fcmp s3, s1
179179
; CHECK-NEXT: fcsel s4, s1, s3, gt
180180
; CHECK-NEXT: fcmp s3, #0.0
181-
; CHECK-NEXT: fcvtzs w11, s2
181+
; CHECK-NEXT: fcvtzs s2, s2
182182
; CHECK-NEXT: fcsel s3, s0, s4, mi
183183
; CHECK-NEXT: subs w10, w10, #1
184-
; CHECK-NEXT: strb w11, [x9]
185-
; CHECK-NEXT: fcvtzs w12, s3
186-
; CHECK-NEXT: strb w12, [x9, #1]
184+
; CHECK-NEXT: str b2, [x9]
185+
; CHECK-NEXT: fcvtzs s3, s3
186+
; CHECK-NEXT: stur b3, [x9, #1]
187187
; CHECK-NEXT: add x9, x9, #2
188188
; CHECK-NEXT: b.ne .LBB1_6
189189
; CHECK-NEXT: .LBB1_7: // %for.cond.cleanup
@@ -395,19 +395,19 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
395395
; CHECK-NEXT: fcsel s4, s1, s3, gt
396396
; CHECK-NEXT: fcmp s3, #0.0
397397
; CHECK-NEXT: ldr s3, [x8, #8]
398-
; CHECK-NEXT: fcvtzs w11, s2
398+
; CHECK-NEXT: fcvtzs s2, s2
399399
; CHECK-NEXT: add x8, x8, #12
400400
; CHECK-NEXT: fcsel s4, s0, s4, mi
401401
; CHECK-NEXT: fcmp s3, s1
402-
; CHECK-NEXT: strb w11, [x9]
402+
; CHECK-NEXT: str b2, [x9]
403403
; CHECK-NEXT: fcsel s5, s1, s3, gt
404404
; CHECK-NEXT: fcmp s3, #0.0
405-
; CHECK-NEXT: fcvtzs w12, s4
405+
; CHECK-NEXT: fcvtzs s4, s4
406406
; CHECK-NEXT: fcsel s3, s0, s5, mi
407407
; CHECK-NEXT: subs w10, w10, #1
408-
; CHECK-NEXT: strb w12, [x9, #1]
409-
; CHECK-NEXT: fcvtzs w13, s3
410-
; CHECK-NEXT: strb w13, [x9, #2]
408+
; CHECK-NEXT: stur b4, [x9, #1]
409+
; CHECK-NEXT: fcvtzs s3, s3
410+
; CHECK-NEXT: stur b3, [x9, #2]
411411
; CHECK-NEXT: add x9, x9, #3
412412
; CHECK-NEXT: b.ne .LBB2_8
413413
; CHECK-NEXT: .LBB2_9: // %for.cond.cleanup
@@ -563,26 +563,26 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
563563
; CHECK-NEXT: fcmp s3, s1
564564
; CHECK-NEXT: fcsel s4, s1, s3, gt
565565
; CHECK-NEXT: fcmp s3, #0.0
566-
; CHECK-NEXT: fcvtzs w11, s2
566+
; CHECK-NEXT: fcvtzs s2, s2
567567
; CHECK-NEXT: ldp s3, s5, [x8, #8]
568568
; CHECK-NEXT: add x8, x8, #16
569569
; CHECK-NEXT: fcsel s4, s0, s4, mi
570570
; CHECK-NEXT: fcmp s3, s1
571-
; CHECK-NEXT: strb w11, [x9]
572-
; CHECK-NEXT: fcvtzs w12, s4
571+
; CHECK-NEXT: str b2, [x9]
572+
; CHECK-NEXT: fcvtzs s4, s4
573573
; CHECK-NEXT: fcsel s6, s1, s3, gt
574574
; CHECK-NEXT: fcmp s3, #0.0
575575
; CHECK-NEXT: fcsel s3, s0, s6, mi
576576
; CHECK-NEXT: fcmp s5, s1
577-
; CHECK-NEXT: strb w12, [x9, #1]
577+
; CHECK-NEXT: stur b4, [x9, #1]
578578
; CHECK-NEXT: fcsel s6, s1, s5, gt
579579
; CHECK-NEXT: fcmp s5, #0.0
580-
; CHECK-NEXT: fcvtzs w13, s3
581-
; CHECK-NEXT: fcsel s2, s0, s6, mi
580+
; CHECK-NEXT: fcvtzs s3, s3
581+
; CHECK-NEXT: fcsel s5, s0, s6, mi
582582
; CHECK-NEXT: subs w10, w10, #1
583-
; CHECK-NEXT: strb w13, [x9, #2]
584-
; CHECK-NEXT: fcvtzs w14, s2
585-
; CHECK-NEXT: strb w14, [x9, #3]
583+
; CHECK-NEXT: stur b3, [x9, #2]
584+
; CHECK-NEXT: fcvtzs s5, s5
585+
; CHECK-NEXT: stur b5, [x9, #3]
586586
; CHECK-NEXT: add x9, x9, #4
587587
; CHECK-NEXT: b.ne .LBB3_6
588588
; CHECK-NEXT: .LBB3_7: // %for.cond.cleanup

0 commit comments

Comments
 (0)