
Commit 865f956

[AArch64][SDAG] Add f16 -> i16 rounding NEON conversion intrinsics (#155851)
Add dedicated .i16.f16 formats for rounding NEON conversion intrinsics in order to avoid issues with incorrect overflow behaviour caused by using .i32.f16 formats to perform the same conversions.

Added intrinsic formats:
  i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half)
  i16 @llvm.aarch64.neon.fcvtzu.i16.f16(half)
  i16 @llvm.aarch64.neon.fcvtas.i16.f16(half)
  i16 @llvm.aarch64.neon.fcvtau.i16.f16(half)
  i16 @llvm.aarch64.neon.fcvtms.i16.f16(half)
  i16 @llvm.aarch64.neon.fcvtmu.i16.f16(half)
  i16 @llvm.aarch64.neon.fcvtns.i16.f16(half)
  i16 @llvm.aarch64.neon.fcvtnu.i16.f16(half)
  i16 @llvm.aarch64.neon.fcvtps.i16.f16(half)
  i16 @llvm.aarch64.neon.fcvtpu.i16.f16(half)

Backend side of the solution to #154343

---------

Signed-off-by: Kajetan Puchalski <[email protected]>
1 parent 63d9e3c commit 865f956
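For context, a minimal LLVM IR sketch (the function names cvt_dedicated and cvt_via_i32 are illustrative, not part of this commit) contrasting the new dedicated .i16.f16 format with the .i32.f16-plus-truncate pattern whose overflow behaviour motivated the change; the dedicated form is selected to the half-precision scalar instructions shown in the tests below.

; Hypothetical example: new dedicated format vs. previous i32-based workaround.
declare i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half)
declare i32 @llvm.aarch64.neon.fcvtzs.i32.f16(half)

define i16 @cvt_dedicated(half %h) {
  ; Selected to the half-precision scalar form: fcvtzs h0, h0 / fmov w0, s0.
  %r = tail call i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half %h)
  ret i16 %r
}

define i16 @cvt_via_i32(half %h) {
  ; Converts to i32 first and then truncates; out-of-range values can wrap
  ; instead of behaving like a native f16 -> i16 conversion.
  %w = tail call i32 @llvm.aarch64.neon.fcvtzs.i32.f16(half %h)
  %r = trunc i32 %w to i16
  ret i16 %r
}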

File tree: 4 files changed, +294 -4 lines changed

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

Lines changed: 31 additions & 0 deletions
@@ -22189,6 +22189,17 @@ static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
                      Zero);
 }
 
+static SDValue tryCombineNeonFcvtFP16ToI16(SDNode *N, unsigned Opcode,
+                                            SelectionDAG &DAG) {
+  if (N->getValueType(0) != MVT::i16)
+    return SDValue();
+
+  SDLoc DL(N);
+  SDValue CVT = DAG.getNode(Opcode, DL, MVT::f32, N->getOperand(1));
+  SDValue Bitcast = DAG.getBitcast(MVT::i32, CVT);
+  return DAG.getNode(ISD::TRUNCATE, DL, MVT::i16, Bitcast);
+}
+
 // If a merged operation has no inactive lanes we can relax it to a predicated
 // or unpredicated operation, which potentially allows better isel (perhaps
 // using immediate forms) or relaxing register reuse requirements.
@@ -22442,6 +22453,26 @@ static SDValue performIntrinsicCombine(SDNode *N,
   case Intrinsic::aarch64_neon_uabd:
     return DAG.getNode(ISD::ABDU, SDLoc(N), N->getValueType(0),
                        N->getOperand(1), N->getOperand(2));
+  case Intrinsic::aarch64_neon_fcvtzs:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZS_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtzu:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTZU_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtas:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAS_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtau:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTAU_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtms:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMS_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtmu:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTMU_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtns:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNS_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtnu:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTNU_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtps:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPS_HALF, DAG);
+  case Intrinsic::aarch64_neon_fcvtpu:
+    return tryCombineNeonFcvtFP16ToI16(N, AArch64ISD::FCVTPU_HALF, DAG);
   case Intrinsic::aarch64_crc32b:
   case Intrinsic::aarch64_crc32cb:
     return tryCombineCRC32(0xff, N, DAG);

llvm/lib/Target/AArch64/AArch64InstrInfo.td

Lines changed: 16 additions & 0 deletions
@@ -991,6 +991,14 @@ def AArch64fcvtxnv: PatFrags<(ops node:$Rn),
 
 def AArch64fcvtzs_half : SDNode<"AArch64ISD::FCVTZS_HALF", SDTFPExtendOp>;
 def AArch64fcvtzu_half : SDNode<"AArch64ISD::FCVTZU_HALF", SDTFPExtendOp>;
+def AArch64fcvtas_half : SDNode<"AArch64ISD::FCVTAS_HALF", SDTFPExtendOp>;
+def AArch64fcvtau_half : SDNode<"AArch64ISD::FCVTAU_HALF", SDTFPExtendOp>;
+def AArch64fcvtms_half : SDNode<"AArch64ISD::FCVTMS_HALF", SDTFPExtendOp>;
+def AArch64fcvtmu_half : SDNode<"AArch64ISD::FCVTMU_HALF", SDTFPExtendOp>;
+def AArch64fcvtns_half : SDNode<"AArch64ISD::FCVTNS_HALF", SDTFPExtendOp>;
+def AArch64fcvtnu_half : SDNode<"AArch64ISD::FCVTNU_HALF", SDTFPExtendOp>;
+def AArch64fcvtps_half : SDNode<"AArch64ISD::FCVTPS_HALF", SDTFPExtendOp>;
+def AArch64fcvtpu_half : SDNode<"AArch64ISD::FCVTPU_HALF", SDTFPExtendOp>;
 
 //def Aarch64softf32tobf16v8: SDNode<"AArch64ISD::", SDTFPRoundOp>;
 
@@ -6586,6 +6594,14 @@ class F16ToI16ScalarPat<SDNode cvt_isd, BaseSIMDTwoScalar instr>
 let Predicates = [HasFullFP16] in {
 def : F16ToI16ScalarPat<AArch64fcvtzs_half, FCVTZSv1f16>;
 def : F16ToI16ScalarPat<AArch64fcvtzu_half, FCVTZUv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtas_half, FCVTASv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtau_half, FCVTAUv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtms_half, FCVTMSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtmu_half, FCVTMUv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtns_half, FCVTNSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtnu_half, FCVTNUv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtps_half, FCVTPSv1f16>;
+def : F16ToI16ScalarPat<AArch64fcvtpu_half, FCVTPUv1f16>;
 }
 
 // Round FP64 to BF16.

Lines changed: 128 additions & 0 deletions
@@ -0,0 +1,128 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64 -global-isel=0 -mattr=+v8.2a,+fullfp16 | FileCheck %s
+
+; Test f16 -> i16 NEON intrinsics, currently only supported in SDAG.
+; Should be merged with fp16_intrinsic_scalar_1op.ll once there is
+; support in GISel.
+
+declare i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtzu.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtas.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtau.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtms.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtmu.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtns.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtnu.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtps.i16.f16(half)
+declare i16 @llvm.aarch64.neon.fcvtpu.i16.f16(half)
+
+
+define i16 @fcvtzs_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtzs_intrinsic_i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs h0, h0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtzs.i16.f16(half %a)
+  ret i16 %fcvt
+}
+
+define i16 @fcvtzu_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtzu_intrinsic_i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu h0, h0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtzu.i16.f16(half %a)
+  ret i16 %fcvt
+}
+
+define i16 @fcvtas_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtas_intrinsic_i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtas h0, h0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtas.i16.f16(half %a)
+  ret i16 %fcvt
+}
+
+define i16 @fcvtau_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtau_intrinsic_i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtau h0, h0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtau.i16.f16(half %a)
+  ret i16 %fcvt
+}
+
+define i16 @fcvtms_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtms_intrinsic_i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtms h0, h0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtms.i16.f16(half %a)
+  ret i16 %fcvt
+}
+
+define i16 @fcvtmu_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtmu_intrinsic_i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtmu h0, h0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtmu.i16.f16(half %a)
+  ret i16 %fcvt
+}
+
+define i16 @fcvtns_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtns_intrinsic_i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtns h0, h0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtns.i16.f16(half %a)
+  ret i16 %fcvt
+}
+
+define i16 @fcvtnu_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtnu_intrinsic_i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtnu h0, h0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtnu.i16.f16(half %a)
+  ret i16 %fcvt
+}
+
+define i16 @fcvtps_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtps_intrinsic_i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtps h0, h0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtps.i16.f16(half %a)
+  ret i16 %fcvt
+}
+
+define i16 @fcvtpu_intrinsic_i16(half %a) {
+; CHECK-LABEL: fcvtpu_intrinsic_i16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtpu h0, h0
+; CHECK-NEXT:    fmov w0, s0
+; CHECK-NEXT:    ret
+entry:
+  %fcvt = tail call i16 @llvm.aarch64.neon.fcvtpu.i16.f16(half %a)
+  ret i16 %fcvt
+}

Lines changed: 119 additions & 4 deletions
@@ -1,13 +1,25 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=aarch64 -mattr=+v8.2a,+fullfp16 | FileCheck %s
 
 declare <4 x half> @llvm.nearbyint.v4f16(<4 x half>)
 declare <8 x half> @llvm.nearbyint.v8f16(<8 x half>)
 declare <4 x half> @llvm.sqrt.v4f16(<4 x half>)
 declare <8 x half> @llvm.sqrt.v8f16(<8 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half>)
+declare <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half>)
 
 define dso_local <4 x half> @t_vrndi_f16(<4 x half> %a) {
 ; CHECK-LABEL: t_vrndi_f16:
-; CHECK: frinti v0.4h, v0.4h
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    frinti v0.4h, v0.4h
 ; CHECK-NEXT:    ret
 entry:
   %vrndi1.i = tail call <4 x half> @llvm.nearbyint.v4f16(<4 x half> %a)
@@ -16,7 +28,8 @@ entry:
 
 define dso_local <8 x half> @t_vrndiq_f16(<8 x half> %a) {
 ; CHECK-LABEL: t_vrndiq_f16:
-; CHECK: frinti v0.8h, v0.8h
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    frinti v0.8h, v0.8h
 ; CHECK-NEXT:    ret
 entry:
   %vrndi1.i = tail call <8 x half> @llvm.nearbyint.v8f16(<8 x half> %a)
@@ -25,7 +38,8 @@ entry:
 
 define dso_local <4 x half> @t_vsqrt_f16(<4 x half> %a) {
 ; CHECK-LABEL: t_vsqrt_f16:
-; CHECK: fsqrt v0.4h, v0.4h
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fsqrt v0.4h, v0.4h
 ; CHECK-NEXT:    ret
 entry:
   %vsqrt.i = tail call <4 x half> @llvm.sqrt.v4f16(<4 x half> %a)
@@ -34,9 +48,110 @@ entry:
 
 define dso_local <8 x half> @t_vsqrtq_f16(<8 x half> %a) {
 ; CHECK-LABEL: t_vsqrtq_f16:
-; CHECK: fsqrt v0.8h, v0.8h
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fsqrt v0.8h, v0.8h
 ; CHECK-NEXT:    ret
 entry:
   %vsqrt.i = tail call <8 x half> @llvm.sqrt.v8f16(<8 x half> %a)
   ret <8 x half> %vsqrt.i
 }
+
+define <4 x i16> @t_fcvtzs_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtzs_v4i16_v4f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzs v0.4h, v0.4h
+; CHECK-NEXT:    ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtzs.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtzu_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtzu_v4i16_v4f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtzu v0.4h, v0.4h
+; CHECK-NEXT:    ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtzu.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtas_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtas_v4i16_v4f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtas v0.4h, v0.4h
+; CHECK-NEXT:    ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtas.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtau_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtau_v4i16_v4f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtau v0.4h, v0.4h
+; CHECK-NEXT:    ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtau.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtms_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtms_v4i16_v4f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtms v0.4h, v0.4h
+; CHECK-NEXT:    ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtms.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtmu_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtmu_v4i16_v4f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtmu v0.4h, v0.4h
+; CHECK-NEXT:    ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtmu.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtns_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtns_v4i16_v4f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtns v0.4h, v0.4h
+; CHECK-NEXT:    ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtns.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtnu_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtnu_v4i16_v4f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtnu v0.4h, v0.4h
+; CHECK-NEXT:    ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtnu.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtps_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtps_v4i16_v4f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtps v0.4h, v0.4h
+; CHECK-NEXT:    ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtps.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
+
+define <4 x i16> @t_fcvtpu_v4i16_v4f16(<4 x half> %a) {
+; CHECK-LABEL: t_fcvtpu_v4i16_v4f16:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    fcvtpu v0.4h, v0.4h
+; CHECK-NEXT:    ret
+entry:
+  %vcvt = tail call <4 x i16> @llvm.aarch64.neon.fcvtpu.v4i16.v4f16(<4 x half> %a)
+  ret <4 x i16> %vcvt
+}
