Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5106,6 +5106,29 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op,
uint64_t VTSize = VT.getFixedSizeInBits();
uint64_t InVTSize = InVT.getFixedSizeInBits();
if (VTSize < InVTSize) {
// AArch64 doesn't have a direct vector instruction to convert
// fixed point to floating point AND narrow it at the same time.
// Additional rounding when the target is f32/f64 causes double
// rounding issues. Conversion to f16 is fine due to narrow width.
bool IsTargetf32 = VT.getVectorElementType() == MVT::f32;
bool IsTargetf16 = false;
if (Op.hasOneUse() &&
Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) {
// Some vector types are split during legalization into half, followed by
// concatenation, followed by rounding to the original vector type. If we
// end up resolving to f16 type, we shouldn't worry about rounding errors.
SDNode *U = *Op->user_begin();
if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) {
EVT TmpVT = U->user_begin()->getValueType(0);
if (TmpVT.getScalarType() == MVT::f16)
IsTargetf16 = true;
}
}

if (IsTargetf32 && !IsTargetf16) {
return !IsStrict ? DAG.UnrollVectorOp(Op.getNode()) : SDValue();
}

MVT CastVT =
MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()),
InVT.getVectorNumElements());
Expand Down
32 changes: 19 additions & 13 deletions llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll
Original file line number Diff line number Diff line change
Expand Up @@ -148,9 +148,9 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: str xzr, [x0, #16]
; CHECK-NEXT: uaddlv.4s d1, v0
; CHECK-NEXT: mov.d v0[0], v1[0]
; CHECK-NEXT: ucvtf.2d v0, v0
; CHECK-NEXT: fcvtn v0.2s, v0.2d
; CHECK-NEXT: fmov x8, d1
; CHECK-NEXT: ucvtf s1, x8
; CHECK-NEXT: mov.s v0[0], v1[0]
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret

Expand All @@ -166,10 +166,11 @@ define void @insert_vec_v2i64_uaddlv_from_v4i32(ptr %0) {
; CHECK-LABEL: insert_vec_v2i64_uaddlv_from_v4i32:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: uaddlv.4s d1, v0
; CHECK-NEXT: mov.d v0[0], v1[0]
; CHECK-NEXT: ucvtf.2d v0, v0
; CHECK-NEXT: fcvtn v0.2s, v0.2d
; CHECK-NEXT: uaddlv.4s d0, v0
; CHECK-NEXT: fmov x8, d0
; CHECK-NEXT: movi d0, #0000000000000000
; CHECK-NEXT: ucvtf s1, x8
; CHECK-NEXT: mov.s v0[0], v1[0]
; CHECK-NEXT: str d0, [x0]
; CHECK-NEXT: ret

Expand All @@ -187,9 +188,9 @@ define void @insert_vec_v5i64_uaddlv_from_v4i32(ptr %0) {
; CHECK-NEXT: movi.2d v0, #0000000000000000
; CHECK-NEXT: str wzr, [x0, #16]
; CHECK-NEXT: uaddlv.4s d1, v0
; CHECK-NEXT: mov.d v0[0], v1[0]
; CHECK-NEXT: ucvtf.2d v0, v0
; CHECK-NEXT: fcvtn v0.2s, v0.2d
; CHECK-NEXT: fmov x8, d1
; CHECK-NEXT: ucvtf s1, x8
; CHECK-NEXT: mov.s v0[0], v1[0]
; CHECK-NEXT: str q0, [x0]
; CHECK-NEXT: ret

Expand Down Expand Up @@ -254,9 +255,14 @@ define void @insert_vec_v16i64_uaddlv_from_v4i16(ptr %0) {
; CHECK-NEXT: uaddlv.4h s1, v0
; CHECK-NEXT: stp q0, q0, [x0, #32]
; CHECK-NEXT: mov.s v2[0], v1[0]
; CHECK-NEXT: ucvtf.2d v1, v2
; CHECK-NEXT: fcvtn v1.2s, v1.2d
; CHECK-NEXT: stp q1, q0, [x0]
; CHECK-NEXT: fmov x8, d2
; CHECK-NEXT: mov.d x9, v2[1]
; CHECK-NEXT: movi.2d v2, #0000000000000000
; CHECK-NEXT: ucvtf s1, x8
; CHECK-NEXT: ucvtf s3, x9
; CHECK-NEXT: mov.s v2[0], v1[0]
; CHECK-NEXT: mov.s v2[1], v3[0]
; CHECK-NEXT: stp q2, q0, [x0]
; CHECK-NEXT: ret

entry:
Expand Down
33 changes: 20 additions & 13 deletions llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll
Original file line number Diff line number Diff line change
Expand Up @@ -53,20 +53,27 @@ define <4 x half> @uitofp_v4i64_to_v4f16(ptr %ptr) {
define <4 x bfloat> @uitofp_v4i64_to_v4bf16(ptr %ptr) {
; CHECK-LABEL: uitofp_v4i64_to_v4bf16:
; CHECK: // %bb.0:
; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ldp q0, q2, [x0]
; CHECK-NEXT: mov x8, v0.d[1]
; CHECK-NEXT: fmov x9, d0
; CHECK-NEXT: ucvtf s1, x9
; CHECK-NEXT: mov x9, v2.d[1]
; CHECK-NEXT: ucvtf s0, x8
; CHECK-NEXT: fmov x8, d2
; CHECK-NEXT: ucvtf s2, x8
; CHECK-NEXT: mov v1.s[1], v0.s[0]
; CHECK-NEXT: ucvtf s0, x9
; CHECK-NEXT: mov v1.s[2], v2.s[0]
; CHECK-NEXT: movi v2.4s, #127, msl #8
; CHECK-NEXT: ucvtf v0.2d, v0.2d
; CHECK-NEXT: ucvtf v1.2d, v1.2d
; CHECK-NEXT: fcvtn v0.2s, v0.2d
; CHECK-NEXT: fcvtn2 v0.4s, v1.2d
; CHECK-NEXT: movi v1.4s, #1
; CHECK-NEXT: ushr v3.4s, v0.4s, #16
; CHECK-NEXT: add v2.4s, v0.4s, v2.4s
; CHECK-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s
; CHECK-NEXT: orr v0.4s, #64, lsl #16
; CHECK-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b
; CHECK-NEXT: mov v1.s[3], v0.s[0]
; CHECK-NEXT: movi v0.4s, #1
; CHECK-NEXT: ushr v3.4s, v1.4s, #16
; CHECK-NEXT: add v2.4s, v1.4s, v2.4s
; CHECK-NEXT: and v0.16b, v3.16b, v0.16b
; CHECK-NEXT: fcmeq v3.4s, v1.4s, v1.4s
; CHECK-NEXT: orr v1.4s, #64, lsl #16
; CHECK-NEXT: add v0.4s, v0.4s, v2.4s
; CHECK-NEXT: bif v0.16b, v1.16b, v3.16b
; CHECK-NEXT: shrn v0.4h, v0.4s, #16
; CHECK-NEXT: ret
%tmp1 = load <4 x i64>, ptr %ptr
Expand Down
100 changes: 64 additions & 36 deletions llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll
Original file line number Diff line number Diff line change
Expand Up @@ -310,29 +310,43 @@ define <4 x bfloat> @sitofp_i32(<4 x i32> %a) #0 {
define <4 x bfloat> @sitofp_i64(<4 x i64> %a) #0 {
; CHECK-CVT-LABEL: sitofp_i64:
; CHECK-CVT: // %bb.0:
; CHECK-CVT-NEXT: scvtf v0.2d, v0.2d
; CHECK-CVT-NEXT: scvtf v1.2d, v1.2d
; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d
; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d
; CHECK-CVT-NEXT: movi v1.4s, #1
; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16
; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
; CHECK-CVT-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-CVT-NEXT: fcmeq v3.4s, v0.4s, v0.4s
; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
; CHECK-CVT-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v3.16b
; CHECK-CVT-NEXT: mov x8, v0.d[1]
; CHECK-CVT-NEXT: fmov x9, d0
; CHECK-CVT-NEXT: scvtf s2, x9
; CHECK-CVT-NEXT: mov x9, v1.d[1]
; CHECK-CVT-NEXT: scvtf s0, x8
; CHECK-CVT-NEXT: fmov x8, d1
; CHECK-CVT-NEXT: scvtf s1, x8
; CHECK-CVT-NEXT: mov v2.s[1], v0.s[0]
; CHECK-CVT-NEXT: scvtf s0, x9
; CHECK-CVT-NEXT: mov v2.s[2], v1.s[0]
; CHECK-CVT-NEXT: movi v1.4s, #127, msl #8
; CHECK-CVT-NEXT: mov v2.s[3], v0.s[0]
; CHECK-CVT-NEXT: movi v0.4s, #1
; CHECK-CVT-NEXT: ushr v3.4s, v2.4s, #16
; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s
; CHECK-CVT-NEXT: and v0.16b, v3.16b, v0.16b
; CHECK-CVT-NEXT: fcmeq v3.4s, v2.4s, v2.4s
; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16
; CHECK-CVT-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-CVT-NEXT: bif v0.16b, v2.16b, v3.16b
; CHECK-CVT-NEXT: shrn v0.4h, v0.4s, #16
; CHECK-CVT-NEXT: ret
;
; CHECK-BF16-LABEL: sitofp_i64:
; CHECK-BF16: // %bb.0:
; CHECK-BF16-NEXT: scvtf v0.2d, v0.2d
; CHECK-BF16-NEXT: scvtf v1.2d, v1.2d
; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d
; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d
; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
; CHECK-BF16-NEXT: mov x8, v0.d[1]
; CHECK-BF16-NEXT: fmov x9, d0
; CHECK-BF16-NEXT: scvtf s2, x9
; CHECK-BF16-NEXT: mov x9, v1.d[1]
; CHECK-BF16-NEXT: scvtf s0, x8
; CHECK-BF16-NEXT: fmov x8, d1
; CHECK-BF16-NEXT: mov v2.s[1], v0.s[0]
; CHECK-BF16-NEXT: scvtf s0, x8
; CHECK-BF16-NEXT: mov v2.s[2], v0.s[0]
; CHECK-BF16-NEXT: scvtf s0, x9
; CHECK-BF16-NEXT: mov v2.s[3], v0.s[0]
; CHECK-BF16-NEXT: bfcvtn v0.4h, v2.4s
; CHECK-BF16-NEXT: ret
%1 = sitofp <4 x i64> %a to <4 x bfloat>
ret <4 x bfloat> %1
Expand Down Expand Up @@ -413,29 +427,43 @@ define <4 x bfloat> @uitofp_i32(<4 x i32> %a) #0 {
define <4 x bfloat> @uitofp_i64(<4 x i64> %a) #0 {
; CHECK-CVT-LABEL: uitofp_i64:
; CHECK-CVT: // %bb.0:
; CHECK-CVT-NEXT: ucvtf v0.2d, v0.2d
; CHECK-CVT-NEXT: ucvtf v1.2d, v1.2d
; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8
; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d
; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d
; CHECK-CVT-NEXT: movi v1.4s, #1
; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16
; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s
; CHECK-CVT-NEXT: and v1.16b, v3.16b, v1.16b
; CHECK-CVT-NEXT: fcmeq v3.4s, v0.4s, v0.4s
; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16
; CHECK-CVT-NEXT: add v1.4s, v1.4s, v2.4s
; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v3.16b
; CHECK-CVT-NEXT: mov x8, v0.d[1]
; CHECK-CVT-NEXT: fmov x9, d0
; CHECK-CVT-NEXT: ucvtf s2, x9
; CHECK-CVT-NEXT: mov x9, v1.d[1]
; CHECK-CVT-NEXT: ucvtf s0, x8
; CHECK-CVT-NEXT: fmov x8, d1
; CHECK-CVT-NEXT: ucvtf s1, x8
; CHECK-CVT-NEXT: mov v2.s[1], v0.s[0]
; CHECK-CVT-NEXT: ucvtf s0, x9
; CHECK-CVT-NEXT: mov v2.s[2], v1.s[0]
; CHECK-CVT-NEXT: movi v1.4s, #127, msl #8
; CHECK-CVT-NEXT: mov v2.s[3], v0.s[0]
; CHECK-CVT-NEXT: movi v0.4s, #1
; CHECK-CVT-NEXT: ushr v3.4s, v2.4s, #16
; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s
; CHECK-CVT-NEXT: and v0.16b, v3.16b, v0.16b
; CHECK-CVT-NEXT: fcmeq v3.4s, v2.4s, v2.4s
; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16
; CHECK-CVT-NEXT: add v0.4s, v0.4s, v1.4s
; CHECK-CVT-NEXT: bif v0.16b, v2.16b, v3.16b
; CHECK-CVT-NEXT: shrn v0.4h, v0.4s, #16
; CHECK-CVT-NEXT: ret
;
; CHECK-BF16-LABEL: uitofp_i64:
; CHECK-BF16: // %bb.0:
; CHECK-BF16-NEXT: ucvtf v0.2d, v0.2d
; CHECK-BF16-NEXT: ucvtf v1.2d, v1.2d
; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d
; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d
; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s
; CHECK-BF16-NEXT: mov x8, v0.d[1]
; CHECK-BF16-NEXT: fmov x9, d0
; CHECK-BF16-NEXT: ucvtf s2, x9
; CHECK-BF16-NEXT: mov x9, v1.d[1]
; CHECK-BF16-NEXT: ucvtf s0, x8
; CHECK-BF16-NEXT: fmov x8, d1
; CHECK-BF16-NEXT: mov v2.s[1], v0.s[0]
; CHECK-BF16-NEXT: ucvtf s0, x8
; CHECK-BF16-NEXT: mov v2.s[2], v0.s[0]
; CHECK-BF16-NEXT: ucvtf s0, x9
; CHECK-BF16-NEXT: mov v2.s[3], v0.s[0]
; CHECK-BF16-NEXT: bfcvtn v0.4h, v2.4s
; CHECK-BF16-NEXT: ret
%1 = uitofp <4 x i64> %a to <4 x bfloat>
ret <4 x bfloat> %1
Expand Down
Loading