Skip to content

Commit 58d70dc

Browse files
authored
[AArch64] Keep floating-point conversion in SIMD (#147707)
Stores can be issued faster if the result is kept in the SIMD/FP registers. The `hasOneUse` check guards against creating two floating-point conversions when, for example, some arithmetic is also done on the converted value. Another approach would be to inspect the user instructions during lowering, but I don't see that type of check in the lowering code very often.
1 parent 97fa9a1 commit 58d70dc

File tree

5 files changed

+227
-23
lines changed

5 files changed

+227
-23
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24128,6 +24128,67 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
2412824128
Store->getMemOperand());
2412924129
}
2413024130

24131+
// Combine store (fp_to_int X) to use vector semantics around the conversion
24132+
// when NEON is available. This allows us to store the in-vector result directly
24133+
// without transferring the result into a GPR in the process.
// Returns SDValue(ST, 0) after replacing the stored value through
// DCI.CombineTo, or an empty SDValue when the combine does not apply.
24134+
static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
24135+
TargetLowering::DAGCombinerInfo &DCI,
24136+
SelectionDAG &DAG,
24137+
const AArch64Subtarget *Subtarget) {
24138+
// Limit to post-legalization in order to avoid peeling truncating stores.
24139+
if (DCI.isBeforeLegalize())
24140+
return SDValue();
24141+
if (!Subtarget->isNeonAvailable())
24142+
return SDValue();
24143+
// Nothing to do if the stored value is already a vector.
24144+
SDValue Value = ST->getValue();
24145+
if (Value.getValueType().isVector())
24146+
return SDValue();
24147+
24148+
// Look through any assertion nodes wrapping the stored value.
24149+
while (Value->isAssert())
24150+
Value = Value.getOperand(0);
24151+
24152+
// Only scalar fp_to_sint / fp_to_uint results are handled.
if (Value.getOpcode() != ISD::FP_TO_SINT &&
24153+
Value.getOpcode() != ISD::FP_TO_UINT)
24154+
return SDValue();
24155+
// Bail out if the conversion has other users: rewriting it for the store
// would then create a second floating-point conversion.
if (!Value->hasOneUse())
24156+
return SDValue();
24157+
24158+
SDValue FPSrc = Value.getOperand(0);
24159+
EVT SrcVT = FPSrc.getValueType();
24160+
if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
24161+
return SDValue();
24162+
24163+
// No support for conversions whose integer result width differs from the
// floating-point source width, such as i64 = fp_to_sint f32.
24164+
EVT VT = Value.getSimpleValueType();
24165+
if (VT != SrcVT.changeTypeToInteger())
24166+
return SDValue();
24167+
24168+
// Create a 128-bit vector of the source type to avoid widening. The floating
24169+
// point conversion is transformed into a single element conversion via a pattern.
24170+
unsigned NumElements = 128 / SrcVT.getFixedSizeInBits();
24171+
EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements);
24172+
EVT VecDstVT = VecSrcVT.changeTypeToInteger();
24173+
SDLoc DL(ST);
24174+
SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc);
24175+
SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
24176+
24177+
// For a truncating store, recast the converted vector (same total width) as
// a vector whose elements have the store's memory type, so that the element
// extracted below is a memory-typed lane.
if (ST->isTruncatingStore()) {
24178+
EVT NewVecDstVT = EVT::getVectorVT(
24179+
*DAG.getContext(), ST->getMemoryVT(),
24180+
VecDstVT.getFixedSizeInBits() / ST->getMemoryVT().getFixedSizeInBits());
24181+
VecConv = DAG.getNode(AArch64ISD::NVCAST, DL, NewVecDstVT, VecConv);
24182+
}
24183+
24184+
// Extract element 0 of the (possibly recast) vector as a value of type VT.
SDValue Zero = DAG.getVectorIdxConstant(0, DL);
24185+
SDValue Extracted =
24186+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero);
24187+
24188+
// Replace the originally stored value (including any assertion nodes looked
// through above) with the extracted lane, and return the store node itself.
DCI.CombineTo(ST->getValue().getNode(), Extracted);
24189+
return SDValue(ST, 0);
24190+
}
24191+
2413124192
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
2413224193
return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
2413324194
(SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
@@ -24210,6 +24271,9 @@ static SDValue performSTORECombine(SDNode *N,
2421024271
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2421124272
SDLoc DL(ST);
2421224273

24274+
if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
24275+
return Res;
24276+
2421324277
auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
2421424278
EVT EltVT = VT.getVectorElementType();
2421524279
return EltVT == MVT::f32 || EltVT == MVT::f64;

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6668,6 +6668,15 @@ def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
66686668
(UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
66696669
}
66706670

6671+
// fp -> int conversion of a 128-bit vector built by scalar_to_vector from an
// f32/f64 value: select the scalar FCVTZS/FCVTZU (v1i32/v1i64) form on the
// source register and insert its result into the ssub/dsub subregister of an
// IMPLICIT_DEF vector, so the converted value stays in an FP/SIMD register.
def : Pat<(v4i32 (any_fp_to_sint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
6672+
(v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZSv1i32 (f32 FPR32:$src))), ssub))>;
6673+
def : Pat<(v4i32 (any_fp_to_uint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
6674+
(v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZUv1i32 (f32 FPR32:$src))), ssub))>;
6675+
def : Pat<(v2i64 (any_fp_to_sint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
6676+
(v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZSv1i64 (f64 FPR64:$src))), dsub))>;
6677+
def : Pat<(v2i64 (any_fp_to_uint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
6678+
(v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZUv1i64 (f64 FPR64:$src))), dsub))>;
6679+
66716680
// int -> float conversion of value in lane 0 of simd vector should use
66726681
// correct cvtf variant to avoid costly fpr <-> gpr register transfers.
66736682
def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),

llvm/test/CodeGen/AArch64/selectopt-const.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) {
2929
; CHECK-NEXT: csel x10, x9, xzr, lt
3030
; CHECK-NEXT: subs x8, x8, #1
3131
; CHECK-NEXT: ldr s3, [x4, x10]
32-
; CHECK-NEXT: fcvtzs w10, s3
33-
; CHECK-NEXT: str w10, [x2], #4
32+
; CHECK-NEXT: fcvtzs s3, s3
33+
; CHECK-NEXT: st1 { v3.s }[0], [x2], #4
3434
; CHECK-NEXT: b.ne .LBB0_2
3535
; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup
3636
; CHECK-NEXT: mov w0, wzr
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -verify-machineinstrs -mtriple=aarch64 < %s | FileCheck %s
3+
4+
; Unsigned f32 -> i32 conversion truncated to i8 and stored: the conversion is
; expected in an FP/SIMD register, with the byte stored directly from it.
define void @f32_to_u8(float %f, ptr %dst) {
5+
; CHECK-LABEL: f32_to_u8:
6+
; CHECK: // %bb.0: // %entry
7+
; CHECK-NEXT: fcvtzu s0, s0
8+
; CHECK-NEXT: str b0, [x0]
9+
; CHECK-NEXT: ret
10+
entry:
11+
%conv = fptoui float %f to i32
12+
%trunc = trunc i32 %conv to i8
13+
store i8 %trunc, ptr %dst
14+
ret void
15+
}
16+
17+
; Signed f32 -> i32 conversion truncated to i8 and stored: the conversion is
; expected in an FP/SIMD register, with the byte stored directly from it.
define void @f32_to_s8(float %f, ptr %dst) {
18+
; CHECK-LABEL: f32_to_s8:
19+
; CHECK: // %bb.0: // %entry
20+
; CHECK-NEXT: fcvtzs s0, s0
21+
; CHECK-NEXT: str b0, [x0]
22+
; CHECK-NEXT: ret
23+
entry:
24+
%conv = fptosi float %f to i32
25+
%trunc = trunc i32 %conv to i8
26+
store i8 %trunc, ptr %dst
27+
ret void
28+
}
29+
30+
; Unsigned f32 -> i32 conversion truncated to i16 and stored: the conversion is
; expected in an FP/SIMD register, with the halfword stored directly from it.
define void @f32_to_u16(float %f, ptr %dst) {
31+
; CHECK-LABEL: f32_to_u16:
32+
; CHECK: // %bb.0: // %entry
33+
; CHECK-NEXT: fcvtzu s0, s0
34+
; CHECK-NEXT: str h0, [x0]
35+
; CHECK-NEXT: ret
36+
entry:
37+
%conv = fptoui float %f to i32
38+
%trunc = trunc i32 %conv to i16
39+
store i16 %trunc, ptr %dst
40+
ret void
41+
}
42+
43+
; Signed f32 -> i32 conversion truncated to i16 and stored: the conversion is
; expected in an FP/SIMD register, with the halfword stored directly from it.
define void @f32_to_s16(float %f, ptr %dst) {
44+
; CHECK-LABEL: f32_to_s16:
45+
; CHECK: // %bb.0: // %entry
46+
; CHECK-NEXT: fcvtzs s0, s0
47+
; CHECK-NEXT: str h0, [x0]
48+
; CHECK-NEXT: ret
49+
entry:
50+
%conv = fptosi float %f to i32
51+
%trunc = trunc i32 %conv to i16
52+
store i16 %trunc, ptr %dst
53+
ret void
54+
}
55+
56+
; Unsigned f32 -> i32 conversion stored without truncation: the conversion is
; expected in an FP/SIMD register, with the word stored directly from it.
define void @f32_to_u32(float %f, ptr %dst) {
57+
; CHECK-LABEL: f32_to_u32:
58+
; CHECK: // %bb.0: // %entry
59+
; CHECK-NEXT: fcvtzu s0, s0
60+
; CHECK-NEXT: str s0, [x0]
61+
; CHECK-NEXT: ret
62+
entry:
63+
%conv = fptoui float %f to i32
64+
store i32 %conv, ptr %dst
65+
ret void
66+
}
67+
68+
; Signed f32 -> i32 conversion stored without truncation: the conversion is
; expected in an FP/SIMD register, with the word stored directly from it.
define void @f32_to_s32(float %f, ptr %dst) {
69+
; CHECK-LABEL: f32_to_s32:
70+
; CHECK: // %bb.0: // %entry
71+
; CHECK-NEXT: fcvtzs s0, s0
72+
; CHECK-NEXT: str s0, [x0]
73+
; CHECK-NEXT: ret
74+
entry:
75+
%conv = fptosi float %f to i32
76+
store i32 %conv, ptr %dst
77+
ret void
78+
}
79+
80+
; Negative test: the stored i64 is a sign extension of the i32 conversion
; result rather than the conversion itself, so the GPR sequence (fcvtzs into
; w8, sxtw, str x8) is expected.
define void @f32_to_s64(float %f, ptr %dst) {
81+
; CHECK-LABEL: f32_to_s64:
82+
; CHECK: // %bb.0: // %entry
83+
; CHECK-NEXT: fcvtzs w8, s0
84+
; CHECK-NEXT: sxtw x8, w8
85+
; CHECK-NEXT: str x8, [x0]
86+
; CHECK-NEXT: ret
87+
entry:
88+
%conv = fptosi float %f to i32
89+
%ext = sext i32 %conv to i64
90+
store i64 %ext, ptr %dst
91+
ret void
92+
}
93+
94+
; Unsigned f64 -> i64 conversion stored without truncation: the conversion is
; expected in an FP/SIMD register, with the doubleword stored directly from it.
define void @f64_to_u64(double %d, ptr %dst) {
94+
; CHECK-LABEL: f64_to_u64:
95+
; CHECK: // %bb.0: // %entry
96+
; CHECK-NEXT: fcvtzu d0, d0
97+
; CHECK-NEXT: str d0, [x0]
98+
; CHECK-NEXT: ret
99+
entry:
100+
%conv = fptoui double %d to i64
101+
store i64 %conv, ptr %dst
102+
ret void
103+
}
105+
106+
; Signed f64 -> i64 conversion stored without truncation: the conversion is
; expected in an FP/SIMD register, with the doubleword stored directly from it.
define void @f64_to_s64(double %d, ptr %dst) {
107+
; CHECK-LABEL: f64_to_s64:
108+
; CHECK: // %bb.0: // %entry
109+
; CHECK-NEXT: fcvtzs d0, d0
110+
; CHECK-NEXT: str d0, [x0]
111+
; CHECK-NEXT: ret
112+
entry:
113+
%conv = fptosi double %d to i64
114+
store i64 %conv, ptr %dst
115+
ret void
116+
}
117+
118+
; Negative test: the conversion result is both stored (truncated to i8) and
; returned, so it has more than one use; the conversion is expected to target
; a GPR (fcvtzs into w8) and the byte to be stored with strb.
define i32 @f32_to_i32_multiple_uses(float %f, ptr %dst) {
119+
; CHECK-LABEL: f32_to_i32_multiple_uses:
120+
; CHECK: // %bb.0: // %entry
121+
; CHECK-NEXT: fcvtzs w8, s0
122+
; CHECK-NEXT: mov x9, x0
123+
; CHECK-NEXT: mov w0, w8
124+
; CHECK-NEXT: strb w8, [x9]
125+
; CHECK-NEXT: ret
126+
entry:
127+
%conv = fptosi float %f to i32
128+
%trunc = trunc i32 %conv to i8
129+
store i8 %trunc, ptr %dst
130+
ret i32 %conv
131+
}

llvm/test/CodeGen/AArch64/tbl-loops.ll

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
6363
; CHECK-NEXT: fcmp s2, #0.0
6464
; CHECK-NEXT: fcsel s2, s0, s3, mi
6565
; CHECK-NEXT: subs w10, w10, #1
66-
; CHECK-NEXT: fcvtzs w11, s2
67-
; CHECK-NEXT: strb w11, [x9], #1
66+
; CHECK-NEXT: fcvtzs s2, s2
67+
; CHECK-NEXT: st1 { v2.b }[0], [x9], #1
6868
; CHECK-NEXT: b.ne .LBB0_7
6969
; CHECK-NEXT: .LBB0_8: // %for.cond.cleanup
7070
; CHECK-NEXT: ret
@@ -178,12 +178,12 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
178178
; CHECK-NEXT: fcmp s3, s1
179179
; CHECK-NEXT: fcsel s4, s1, s3, gt
180180
; CHECK-NEXT: fcmp s3, #0.0
181-
; CHECK-NEXT: fcvtzs w11, s2
181+
; CHECK-NEXT: fcvtzs s2, s2
182182
; CHECK-NEXT: fcsel s3, s0, s4, mi
183183
; CHECK-NEXT: subs w10, w10, #1
184-
; CHECK-NEXT: strb w11, [x9]
185-
; CHECK-NEXT: fcvtzs w12, s3
186-
; CHECK-NEXT: strb w12, [x9, #1]
184+
; CHECK-NEXT: str b2, [x9]
185+
; CHECK-NEXT: fcvtzs s3, s3
186+
; CHECK-NEXT: stur b3, [x9, #1]
187187
; CHECK-NEXT: add x9, x9, #2
188188
; CHECK-NEXT: b.ne .LBB1_6
189189
; CHECK-NEXT: .LBB1_7: // %for.cond.cleanup
@@ -395,19 +395,19 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
395395
; CHECK-NEXT: fcsel s4, s1, s3, gt
396396
; CHECK-NEXT: fcmp s3, #0.0
397397
; CHECK-NEXT: ldr s3, [x8, #8]
398-
; CHECK-NEXT: fcvtzs w11, s2
398+
; CHECK-NEXT: fcvtzs s2, s2
399399
; CHECK-NEXT: add x8, x8, #12
400400
; CHECK-NEXT: fcsel s4, s0, s4, mi
401401
; CHECK-NEXT: fcmp s3, s1
402-
; CHECK-NEXT: strb w11, [x9]
402+
; CHECK-NEXT: str b2, [x9]
403403
; CHECK-NEXT: fcsel s5, s1, s3, gt
404404
; CHECK-NEXT: fcmp s3, #0.0
405-
; CHECK-NEXT: fcvtzs w12, s4
405+
; CHECK-NEXT: fcvtzs s4, s4
406406
; CHECK-NEXT: fcsel s3, s0, s5, mi
407407
; CHECK-NEXT: subs w10, w10, #1
408-
; CHECK-NEXT: strb w12, [x9, #1]
409-
; CHECK-NEXT: fcvtzs w13, s3
410-
; CHECK-NEXT: strb w13, [x9, #2]
408+
; CHECK-NEXT: stur b4, [x9, #1]
409+
; CHECK-NEXT: fcvtzs s3, s3
410+
; CHECK-NEXT: stur b3, [x9, #2]
411411
; CHECK-NEXT: add x9, x9, #3
412412
; CHECK-NEXT: b.ne .LBB2_8
413413
; CHECK-NEXT: .LBB2_9: // %for.cond.cleanup
@@ -563,26 +563,26 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
563563
; CHECK-NEXT: fcmp s3, s1
564564
; CHECK-NEXT: fcsel s4, s1, s3, gt
565565
; CHECK-NEXT: fcmp s3, #0.0
566-
; CHECK-NEXT: fcvtzs w11, s2
566+
; CHECK-NEXT: fcvtzs s2, s2
567567
; CHECK-NEXT: ldp s3, s5, [x8, #8]
568568
; CHECK-NEXT: add x8, x8, #16
569569
; CHECK-NEXT: fcsel s4, s0, s4, mi
570570
; CHECK-NEXT: fcmp s3, s1
571-
; CHECK-NEXT: strb w11, [x9]
572-
; CHECK-NEXT: fcvtzs w12, s4
571+
; CHECK-NEXT: str b2, [x9]
572+
; CHECK-NEXT: fcvtzs s4, s4
573573
; CHECK-NEXT: fcsel s6, s1, s3, gt
574574
; CHECK-NEXT: fcmp s3, #0.0
575575
; CHECK-NEXT: fcsel s3, s0, s6, mi
576576
; CHECK-NEXT: fcmp s5, s1
577-
; CHECK-NEXT: strb w12, [x9, #1]
577+
; CHECK-NEXT: stur b4, [x9, #1]
578578
; CHECK-NEXT: fcsel s6, s1, s5, gt
579579
; CHECK-NEXT: fcmp s5, #0.0
580-
; CHECK-NEXT: fcvtzs w13, s3
581-
; CHECK-NEXT: fcsel s2, s0, s6, mi
580+
; CHECK-NEXT: fcvtzs s3, s3
581+
; CHECK-NEXT: fcsel s5, s0, s6, mi
582582
; CHECK-NEXT: subs w10, w10, #1
583-
; CHECK-NEXT: strb w13, [x9, #2]
584-
; CHECK-NEXT: fcvtzs w14, s2
585-
; CHECK-NEXT: strb w14, [x9, #3]
583+
; CHECK-NEXT: stur b3, [x9, #2]
584+
; CHECK-NEXT: fcvtzs s5, s5
585+
; CHECK-NEXT: stur b5, [x9, #3]
586586
; CHECK-NEXT: add x9, x9, #4
587587
; CHECK-NEXT: b.ne .LBB3_6
588588
; CHECK-NEXT: .LBB3_7: // %for.cond.cleanup

0 commit comments

Comments
 (0)