Skip to content

Commit d493cd2

Browse files
authored
Merge pull request #11886 from swiftlang/guy-david/aarch64-simd-fp-conversion
🍒 [AArch64] Keep floating-point conversion in SIMD
2 parents 35f4830 + a87b3b4 commit d493cd2

File tree

5 files changed

+220
-22
lines changed

5 files changed

+220
-22
lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23942,6 +23942,60 @@ static SDValue combineBoolVectorAndTruncateStore(SelectionDAG &DAG,
2394223942
Store->getMemOperand());
2394323943
}
2394423944

23945+
// Combine store (fp_to_int X) to use vector semantics around the conversion
23946+
// when NEON is available. This allows us to store the in-vector result directly
23947+
// without transferring the result into a GPR in the process.
//
// The scalar conversion is rebuilt as
//   extract_vector_elt (fp_to_[su]int (scalar_to_vector X)), 0
// on a 128-bit vector type, and the value originally stored by ST is replaced
// with that extract.
//
// Returns an empty SDValue when the combine does not apply (before
// legalization, NEON unavailable, stored value already a vector, value is not
// an FP_TO_SINT/FP_TO_UINT, conversion has more than one use, source is not
// f32/f64, or the integer result width differs from the source width).
// Otherwise returns the store node ST itself.
23948+
static SDValue combineStoreValueFPToInt(StoreSDNode *ST,
23949+
TargetLowering::DAGCombinerInfo &DCI,
23950+
SelectionDAG &DAG,
23951+
const AArch64Subtarget *Subtarget) {
23952+
// Limit to post-legalization in order to avoid peeling truncating stores.
23953+
if (DCI.isBeforeLegalize())
23954+
return SDValue();
23955+
if (!Subtarget->isNeonAvailable())
23956+
return SDValue();
23957+
// Nothing to do when the stored value is already a vector.
23958+
SDValue Value = ST->getValue();
23959+
if (Value.getValueType().isVector())
23960+
return SDValue();
23961+
23962+
// Look through potential assertions to reach the node that produced the
// stored value.
23963+
while (Value->isAssert())
23964+
Value = Value.getOperand(0);
23965+
23966+
// Only FP -> signed/unsigned integer conversions are handled.
if (Value.getOpcode() != ISD::FP_TO_SINT &&
23967+
Value.getOpcode() != ISD::FP_TO_UINT)
23968+
return SDValue();
23969+
// Bail out unless the conversion result has a single use.
if (!Value->hasOneUse())
23970+
return SDValue();
23971+
23972+
SDValue FPSrc = Value.getOperand(0);
23973+
EVT SrcVT = FPSrc.getValueType();
23974+
if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
23975+
return SDValue();
23976+
23977+
// No support for conversions whose integer result width differs from the
// floating-point source width, such as i64 = fp_to_sint f32.
23978+
EVT VT = Value.getSimpleValueType();
23979+
if (VT != SrcVT.changeTypeToInteger())
23980+
return SDValue();
23981+
23982+
// Create a 128-bit vector (4 x f32 or 2 x f64) to avoid widening. The
23983+
// floating point conversion is transformed into a single element conversion
// via a pattern.
23984+
unsigned NumElements = 128 / SrcVT.getFixedSizeInBits();
23985+
EVT VecSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT, NumElements);
23986+
EVT VecDstVT = VecSrcVT.changeTypeToInteger();
23987+
SDLoc DL(ST);
23988+
SDValue VecFP = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, FPSrc);
23989+
SDValue VecConv = DAG.getNode(Value.getOpcode(), DL, VecDstVT, VecFP);
23990+
23991+
// Read the converted scalar back out of lane 0.
SDValue Zero = DAG.getVectorIdxConstant(0, DL);
23992+
SDValue Extracted =
23993+
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VecConv, Zero);
23994+
23995+
// Replace the store's original value node (ST->getValue(), i.e. including any
// assert nodes that were looked through above) with the lane-0 extract.
// NOTE(review): returning ST itself presumably tells the combiner that the
// store was updated in place - confirm against the DAGCombiner conventions.
DCI.CombineTo(ST->getValue().getNode(), Extracted);
23996+
return SDValue(ST, 0);
23997+
}
23998+
2394523999
bool isHalvingTruncateOfLegalScalableType(EVT SrcVT, EVT DstVT) {
2394624000
return (SrcVT == MVT::nxv8i16 && DstVT == MVT::nxv8i8) ||
2394724001
(SrcVT == MVT::nxv4i32 && DstVT == MVT::nxv4i16) ||
@@ -24024,6 +24078,9 @@ static SDValue performSTORECombine(SDNode *N,
2402424078
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
2402524079
SDLoc DL(ST);
2402624080

24081+
if (SDValue Res = combineStoreValueFPToInt(ST, DCI, DAG, Subtarget))
24082+
return Res;
24083+
2402724084
auto hasValidElementTypeForFPTruncStore = [](EVT VT) {
2402824085
EVT EltVT = VT.getVectorElementType();
2402924086
return EltVT == MVT::f32 || EltVT == MVT::f64;

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6646,6 +6646,15 @@ def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))),
66466646
(UCVTFv1i16 (f16 (FCVTZUv1f16 f16:$Rn)))>;
66476647
}
66486648

6649+
// fp -> int conversion of a scalar that was moved into a vector with
// scalar_to_vector: convert with the FCVTZS/FCVTZU v1i32/v1i64 instruction on
// the FPR32/FPR64 source and insert the result into the ssub/dsub subregister
// of an otherwise undefined (IMPLICIT_DEF) vector.
def : Pat<(v4i32 (any_fp_to_sint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
6650+
(v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZSv1i32 (f32 FPR32:$src))), ssub))>;
6651+
def : Pat<(v4i32 (any_fp_to_uint (v4f32 (scalar_to_vector (f32 FPR32:$src))))),
6652+
(v4i32 (INSERT_SUBREG (IMPLICIT_DEF), (i32 (FCVTZUv1i32 (f32 FPR32:$src))), ssub))>;
6653+
def : Pat<(v2i64 (any_fp_to_sint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
6654+
(v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZSv1i64 (f64 FPR64:$src))), dsub))>;
6655+
def : Pat<(v2i64 (any_fp_to_uint (v2f64 (scalar_to_vector (f64 FPR64:$src))))),
6656+
(v2i64 (INSERT_SUBREG (IMPLICIT_DEF), (i64 (FCVTZUv1i64 (f64 FPR64:$src))), dsub))>;
6657+
66496658
// int -> float conversion of value in lane 0 of simd vector should use
66506659
// correct cvtf variant to avoid costly fpr <-> gpr register transfers.
66516660
def : Pat<(f32 (sint_to_fp (i32 (vector_extract (v4i32 FPR128:$Rn), (i64 0))))),

llvm/test/CodeGen/AArch64/selectopt-const.ll

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ define i32 @test_const(ptr %in1, ptr %in2, ptr %out, i32 %n, ptr %tbl) {
2929
; CHECK-NEXT: csel x10, x9, xzr, lt
3030
; CHECK-NEXT: subs x8, x8, #1
3131
; CHECK-NEXT: ldr s3, [x4, x10]
32-
; CHECK-NEXT: fcvtzs w10, s3
33-
; CHECK-NEXT: str w10, [x2], #4
32+
; CHECK-NEXT: fcvtzs s3, s3
33+
; CHECK-NEXT: st1 { v3.s }[0], [x2], #4
3434
; CHECK-NEXT: b.ne .LBB0_2
3535
; CHECK-NEXT: .LBB0_3: // %for.cond.cleanup
3636
; CHECK-NEXT: mov w0, wzr
Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,131 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
2+
; RUN: llc -verify-machineinstrs -mtriple=aarch64 < %s | FileCheck %s
3+
4+
; fptoui float -> i32, truncated to i8 and stored: the expected code converts
; within the FP/SIMD register (fcvtzu s0, s0) and stores the byte from b0.
define void @f32_to_u8(float %f, ptr %dst) {
5+
; CHECK-LABEL: f32_to_u8:
6+
; CHECK: // %bb.0: // %entry
7+
; CHECK-NEXT: fcvtzu s0, s0
8+
; CHECK-NEXT: str b0, [x0]
9+
; CHECK-NEXT: ret
10+
entry:
11+
%conv = fptoui float %f to i32
12+
%trunc = trunc i32 %conv to i8
13+
store i8 %trunc, ptr %dst
14+
ret void
15+
}
16+
17+
; fptosi float -> i32, truncated to i8 and stored: the expected code converts
; within the FP/SIMD register (fcvtzs s0, s0) and stores the byte from b0.
define void @f32_to_s8(float %f, ptr %dst) {
18+
; CHECK-LABEL: f32_to_s8:
19+
; CHECK: // %bb.0: // %entry
20+
; CHECK-NEXT: fcvtzs s0, s0
21+
; CHECK-NEXT: str b0, [x0]
22+
; CHECK-NEXT: ret
23+
entry:
24+
%conv = fptosi float %f to i32
25+
%trunc = trunc i32 %conv to i8
26+
store i8 %trunc, ptr %dst
27+
ret void
28+
}
29+
30+
; fptoui float -> i32, truncated to i16 and stored: the expected code converts
; within the FP/SIMD register (fcvtzu s0, s0) and stores the halfword from h0.
define void @f32_to_u16(float %f, ptr %dst) {
31+
; CHECK-LABEL: f32_to_u16:
32+
; CHECK: // %bb.0: // %entry
33+
; CHECK-NEXT: fcvtzu s0, s0
34+
; CHECK-NEXT: str h0, [x0]
35+
; CHECK-NEXT: ret
36+
entry:
37+
%conv = fptoui float %f to i32
38+
%trunc = trunc i32 %conv to i16
39+
store i16 %trunc, ptr %dst
40+
ret void
41+
}
42+
43+
; fptosi float -> i32, truncated to i16 and stored: the expected code converts
; within the FP/SIMD register (fcvtzs s0, s0) and stores the halfword from h0.
define void @f32_to_s16(float %f, ptr %dst) {
44+
; CHECK-LABEL: f32_to_s16:
45+
; CHECK: // %bb.0: // %entry
46+
; CHECK-NEXT: fcvtzs s0, s0
47+
; CHECK-NEXT: str h0, [x0]
48+
; CHECK-NEXT: ret
49+
entry:
50+
%conv = fptosi float %f to i32
51+
%trunc = trunc i32 %conv to i16
52+
store i16 %trunc, ptr %dst
53+
ret void
54+
}
55+
56+
; fptoui float -> i32 stored directly: the expected code converts within the
; FP/SIMD register (fcvtzu s0, s0) and stores s0 without going through a GPR.
define void @f32_to_u32(float %f, ptr %dst) {
57+
; CHECK-LABEL: f32_to_u32:
58+
; CHECK: // %bb.0: // %entry
59+
; CHECK-NEXT: fcvtzu s0, s0
60+
; CHECK-NEXT: str s0, [x0]
61+
; CHECK-NEXT: ret
62+
entry:
63+
%conv = fptoui float %f to i32
64+
store i32 %conv, ptr %dst
65+
ret void
66+
}
67+
68+
; fptosi float -> i32 stored directly: the expected code converts within the
; FP/SIMD register (fcvtzs s0, s0) and stores s0 without going through a GPR.
define void @f32_to_s32(float %f, ptr %dst) {
69+
; CHECK-LABEL: f32_to_s32:
70+
; CHECK: // %bb.0: // %entry
71+
; CHECK-NEXT: fcvtzs s0, s0
72+
; CHECK-NEXT: str s0, [x0]
73+
; CHECK-NEXT: ret
74+
entry:
75+
%conv = fptosi float %f to i32
76+
store i32 %conv, ptr %dst
77+
ret void
78+
}
79+
80+
; fptosi float -> i32, sign-extended to i64 and stored: the expected code still
; converts into a GPR (fcvtzs w8, s0), extends with sxtw and stores x8.
define void @f32_to_s64(float %f, ptr %dst) {
81+
; CHECK-LABEL: f32_to_s64:
82+
; CHECK: // %bb.0: // %entry
83+
; CHECK-NEXT: fcvtzs w8, s0
84+
; CHECK-NEXT: sxtw x8, w8
85+
; CHECK-NEXT: str x8, [x0]
86+
; CHECK-NEXT: ret
87+
entry:
88+
%conv = fptosi float %f to i32
89+
%ext = sext i32 %conv to i64
90+
store i64 %ext, ptr %dst
91+
ret void
92+
}
93+
94+
; fptoui double -> i64 stored directly: the expected code converts within the
; FP/SIMD register (fcvtzu d0, d0) and stores d0 without going through a GPR.
define void @f64_to_u64(double %d, ptr %dst) {
95+
; CHECK-LABEL: f64_to_u64:
96+
; CHECK: // %bb.0: // %entry
97+
; CHECK-NEXT: fcvtzu d0, d0
98+
; CHECK-NEXT: str d0, [x0]
99+
; CHECK-NEXT: ret
100+
entry:
101+
%conv = fptoui double %d to i64
102+
store i64 %conv, ptr %dst
103+
ret void
104+
}
105+
106+
; fptosi double -> i64 stored directly: the expected code converts within the
; FP/SIMD register (fcvtzs d0, d0) and stores d0 without going through a GPR.
define void @f64_to_s64(double %d, ptr %dst) {
107+
; CHECK-LABEL: f64_to_s64:
108+
; CHECK: // %bb.0: // %entry
109+
; CHECK-NEXT: fcvtzs d0, d0
110+
; CHECK-NEXT: str d0, [x0]
111+
; CHECK-NEXT: ret
112+
entry:
113+
%conv = fptosi double %d to i64
114+
store i64 %conv, ptr %dst
115+
ret void
116+
}
117+
118+
; The i32 conversion result is both returned and (truncated to i8) stored: the
; expected code converts into a GPR (fcvtzs w8, s0) and stores with strb.
define i32 @f32_to_i32_multiple_uses(float %f, ptr %dst) {
119+
; CHECK-LABEL: f32_to_i32_multiple_uses:
120+
; CHECK: // %bb.0: // %entry
121+
; CHECK-NEXT: fcvtzs w8, s0
122+
; CHECK-NEXT: mov x9, x0
123+
; CHECK-NEXT: mov w0, w8
124+
; CHECK-NEXT: strb w8, [x9]
125+
; CHECK-NEXT: ret
126+
entry:
127+
%conv = fptosi float %f to i32
128+
%trunc = trunc i32 %conv to i8
129+
store i8 %trunc, ptr %dst
130+
ret i32 %conv
131+
}

llvm/test/CodeGen/AArch64/tbl-loops.ll

Lines changed: 21 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,8 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
6363
; CHECK-NEXT: fcmp s2, #0.0
6464
; CHECK-NEXT: fcsel s2, s0, s3, mi
6565
; CHECK-NEXT: subs w10, w10, #1
66-
; CHECK-NEXT: fcvtzs w11, s2
66+
; CHECK-NEXT: fcvtzs s2, s2
67+
; CHECK-NEXT: fmov w11, s2
6768
; CHECK-NEXT: strb w11, [x9], #1
6869
; CHECK-NEXT: b.ne .LBB0_7
6970
; CHECK-NEXT: .LBB0_8: // %for.cond.cleanup
@@ -178,12 +179,12 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
178179
; CHECK-NEXT: fcmp s3, s1
179180
; CHECK-NEXT: fcsel s4, s1, s3, gt
180181
; CHECK-NEXT: fcmp s3, #0.0
181-
; CHECK-NEXT: fcvtzs w11, s2
182+
; CHECK-NEXT: fcvtzs s2, s2
182183
; CHECK-NEXT: fcsel s3, s0, s4, mi
183184
; CHECK-NEXT: subs w10, w10, #1
184-
; CHECK-NEXT: strb w11, [x9]
185-
; CHECK-NEXT: fcvtzs w12, s3
186-
; CHECK-NEXT: strb w12, [x9, #1]
185+
; CHECK-NEXT: str b2, [x9]
186+
; CHECK-NEXT: fcvtzs s3, s3
187+
; CHECK-NEXT: stur b3, [x9, #1]
187188
; CHECK-NEXT: add x9, x9, #2
188189
; CHECK-NEXT: b.ne .LBB1_6
189190
; CHECK-NEXT: .LBB1_7: // %for.cond.cleanup
@@ -395,19 +396,19 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
395396
; CHECK-NEXT: fcsel s4, s1, s3, gt
396397
; CHECK-NEXT: fcmp s3, #0.0
397398
; CHECK-NEXT: ldr s3, [x8, #8]
398-
; CHECK-NEXT: fcvtzs w11, s2
399+
; CHECK-NEXT: fcvtzs s2, s2
399400
; CHECK-NEXT: add x8, x8, #12
400401
; CHECK-NEXT: fcsel s4, s0, s4, mi
401402
; CHECK-NEXT: fcmp s3, s1
402-
; CHECK-NEXT: strb w11, [x9]
403+
; CHECK-NEXT: str b2, [x9]
403404
; CHECK-NEXT: fcsel s5, s1, s3, gt
404405
; CHECK-NEXT: fcmp s3, #0.0
405-
; CHECK-NEXT: fcvtzs w12, s4
406+
; CHECK-NEXT: fcvtzs s4, s4
406407
; CHECK-NEXT: fcsel s3, s0, s5, mi
407408
; CHECK-NEXT: subs w10, w10, #1
408-
; CHECK-NEXT: strb w12, [x9, #1]
409-
; CHECK-NEXT: fcvtzs w13, s3
410-
; CHECK-NEXT: strb w13, [x9, #2]
409+
; CHECK-NEXT: stur b4, [x9, #1]
410+
; CHECK-NEXT: fcvtzs s3, s3
411+
; CHECK-NEXT: stur b3, [x9, #2]
411412
; CHECK-NEXT: add x9, x9, #3
412413
; CHECK-NEXT: b.ne .LBB2_8
413414
; CHECK-NEXT: .LBB2_9: // %for.cond.cleanup
@@ -563,26 +564,26 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
563564
; CHECK-NEXT: fcmp s3, s1
564565
; CHECK-NEXT: fcsel s4, s1, s3, gt
565566
; CHECK-NEXT: fcmp s3, #0.0
566-
; CHECK-NEXT: fcvtzs w11, s2
567+
; CHECK-NEXT: fcvtzs s2, s2
567568
; CHECK-NEXT: ldp s3, s5, [x8, #8]
568569
; CHECK-NEXT: add x8, x8, #16
569570
; CHECK-NEXT: fcsel s4, s0, s4, mi
570571
; CHECK-NEXT: fcmp s3, s1
571-
; CHECK-NEXT: strb w11, [x9]
572-
; CHECK-NEXT: fcvtzs w12, s4
572+
; CHECK-NEXT: str b2, [x9]
573+
; CHECK-NEXT: fcvtzs s4, s4
573574
; CHECK-NEXT: fcsel s6, s1, s3, gt
574575
; CHECK-NEXT: fcmp s3, #0.0
575576
; CHECK-NEXT: fcsel s3, s0, s6, mi
576577
; CHECK-NEXT: fcmp s5, s1
577-
; CHECK-NEXT: strb w12, [x9, #1]
578+
; CHECK-NEXT: stur b4, [x9, #1]
578579
; CHECK-NEXT: fcsel s6, s1, s5, gt
579580
; CHECK-NEXT: fcmp s5, #0.0
580-
; CHECK-NEXT: fcvtzs w13, s3
581-
; CHECK-NEXT: fcsel s2, s0, s6, mi
581+
; CHECK-NEXT: fcvtzs s3, s3
582+
; CHECK-NEXT: fcsel s5, s0, s6, mi
582583
; CHECK-NEXT: subs w10, w10, #1
583-
; CHECK-NEXT: strb w13, [x9, #2]
584-
; CHECK-NEXT: fcvtzs w14, s2
585-
; CHECK-NEXT: strb w14, [x9, #3]
584+
; CHECK-NEXT: stur b3, [x9, #2]
585+
; CHECK-NEXT: fcvtzs s5, s5
586+
; CHECK-NEXT: stur b5, [x9, #3]
586587
; CHECK-NEXT: add x9, x9, #4
587588
; CHECK-NEXT: b.ne .LBB3_6
588589
; CHECK-NEXT: .LBB3_7: // %for.cond.cleanup

0 commit comments

Comments
 (0)