diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index d0f51b73a4a44..1c8e3afdfd718 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -5106,6 +5106,29 @@ SDValue AArch64TargetLowering::LowerVectorINT_TO_FP(SDValue Op, uint64_t VTSize = VT.getFixedSizeInBits(); uint64_t InVTSize = InVT.getFixedSizeInBits(); if (VTSize < InVTSize) { + // AArch64 doesn't have a direct vector instruction to convert + // fixed point to floating point AND narrow it at the same time. + // Additional rounding when the target is f32/f64 causes double + // rounding issues. Conversion to f16 is fine due to its narrow width. + bool IsTargetf32 = VT.getVectorElementType() == MVT::f32; + bool IsTargetf16 = false; + if (Op.hasOneUse() && + Op->user_begin()->getOpcode() == ISD::CONCAT_VECTORS) { + // Some vector types are split in half during legalization, followed by + // concatenation, followed by rounding to the original vector type. If we + // end up resolving to f16 type, we shouldn't worry about rounding errors. + SDNode *U = *Op->user_begin(); + if (U->hasOneUse() && U->user_begin()->getOpcode() == ISD::FP_ROUND) { + EVT TmpVT = U->user_begin()->getValueType(0); + if (TmpVT.getScalarType() == MVT::f16) + IsTargetf16 = true; + } + } + + if (IsTargetf32 && !IsTargetf16) { + return !IsStrict ? 
DAG.UnrollVectorOp(Op.getNode()) : SDValue(); + } + MVT CastVT = MVT::getVectorVT(MVT::getFloatingPointVT(InVT.getScalarSizeInBits()), InVT.getVectorNumElements()); diff --git a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll index b357a24f892ff..91eda8d552397 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-neon-vector-insert-uaddlv.ll @@ -148,9 +148,9 @@ define void @insert_vec_v6i64_uaddlv_from_v4i32(ptr %0) { ; CHECK-NEXT: movi.2d v0, #0000000000000000 ; CHECK-NEXT: str xzr, [x0, #16] ; CHECK-NEXT: uaddlv.4s d1, v0 -; CHECK-NEXT: mov.d v0[0], v1[0] -; CHECK-NEXT: ucvtf.2d v0, v0 -; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ucvtf s1, x8 +; CHECK-NEXT: mov.s v0[0], v1[0] ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret @@ -166,10 +166,11 @@ define void @insert_vec_v2i64_uaddlv_from_v4i32(ptr %0) { ; CHECK-LABEL: insert_vec_v2i64_uaddlv_from_v4i32: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: movi.2d v0, #0000000000000000 -; CHECK-NEXT: uaddlv.4s d1, v0 -; CHECK-NEXT: mov.d v0[0], v1[0] -; CHECK-NEXT: ucvtf.2d v0, v0 -; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: uaddlv.4s d0, v0 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: movi d0, #0000000000000000 +; CHECK-NEXT: ucvtf s1, x8 +; CHECK-NEXT: mov.s v0[0], v1[0] ; CHECK-NEXT: str d0, [x0] ; CHECK-NEXT: ret @@ -187,9 +188,9 @@ define void @insert_vec_v5i64_uaddlv_from_v4i32(ptr %0) { ; CHECK-NEXT: movi.2d v0, #0000000000000000 ; CHECK-NEXT: str wzr, [x0, #16] ; CHECK-NEXT: uaddlv.4s d1, v0 -; CHECK-NEXT: mov.d v0[0], v1[0] -; CHECK-NEXT: ucvtf.2d v0, v0 -; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ucvtf s1, x8 +; CHECK-NEXT: mov.s v0[0], v1[0] ; CHECK-NEXT: str q0, [x0] ; CHECK-NEXT: ret @@ -254,9 +255,14 @@ define void @insert_vec_v16i64_uaddlv_from_v4i16(ptr %0) { ; CHECK-NEXT: uaddlv.4h s1, v0 ; CHECK-NEXT: stp q0, 
q0, [x0, #32] ; CHECK-NEXT: mov.s v2[0], v1[0] -; CHECK-NEXT: ucvtf.2d v1, v2 -; CHECK-NEXT: fcvtn v1.2s, v1.2d -; CHECK-NEXT: stp q1, q0, [x0] +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: mov.d x9, v2[1] +; CHECK-NEXT: movi.2d v2, #0000000000000000 +; CHECK-NEXT: ucvtf s1, x8 +; CHECK-NEXT: ucvtf s3, x9 +; CHECK-NEXT: mov.s v2[0], v1[0] +; CHECK-NEXT: mov.s v2[1], v3[0] +; CHECK-NEXT: stp q2, q0, [x0] ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll index 508f68d6f14d4..2b9e334cc7812 100644 --- a/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll +++ b/llvm/test/CodeGen/AArch64/arm64-convert-v4f64.ll @@ -53,20 +53,27 @@ define <4 x half> @uitofp_v4i64_to_v4f16(ptr %ptr) { define <4 x bfloat> @uitofp_v4i64_to_v4bf16(ptr %ptr) { ; CHECK-LABEL: uitofp_v4i64_to_v4bf16: ; CHECK: // %bb.0: -; CHECK-NEXT: ldp q0, q1, [x0] +; CHECK-NEXT: ldp q0, q2, [x0] +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: ucvtf s1, x9 +; CHECK-NEXT: mov x9, v2.d[1] +; CHECK-NEXT: ucvtf s0, x8 +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: ucvtf s2, x8 +; CHECK-NEXT: mov v1.s[1], v0.s[0] +; CHECK-NEXT: ucvtf s0, x9 +; CHECK-NEXT: mov v1.s[2], v2.s[0] ; CHECK-NEXT: movi v2.4s, #127, msl #8 -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v2.4s, v0.4s, v2.4s -; CHECK-NEXT: and v1.16b, v3.16b, v1.16b -; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b +; CHECK-NEXT: mov v1.s[3], v0.s[0] +; CHECK-NEXT: movi v0.4s, #1 +; CHECK-NEXT: ushr v3.4s, v1.4s, #16 +; CHECK-NEXT: add v2.4s, v1.4s, v2.4s +; CHECK-NEXT: and v0.16b, v3.16b, v0.16b +; CHECK-NEXT: fcmeq v3.4s, v1.4s, v1.4s +; CHECK-NEXT: orr v1.4s, #64, lsl #16 +; 
CHECK-NEXT: add v0.4s, v0.4s, v2.4s +; CHECK-NEXT: bif v0.16b, v1.16b, v3.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret %tmp1 = load <4 x i64>, ptr %ptr diff --git a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll index 1cd0294b0083e..e185da3093645 100644 --- a/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-v4-instructions.ll @@ -310,29 +310,43 @@ define <4 x bfloat> @sitofp_i32(<4 x i32> %a) #0 { define <4 x bfloat> @sitofp_i64(<4 x i64> %a) #0 { ; CHECK-CVT-LABEL: sitofp_i64: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: scvtf v0.2d, v0.2d -; CHECK-CVT-NEXT: scvtf v1.2d, v1.2d -; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8 -; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d -; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s -; CHECK-CVT-NEXT: and v1.16b, v3.16b, v1.16b -; CHECK-CVT-NEXT: fcmeq v3.4s, v0.4s, v0.4s -; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-CVT-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v3.16b +; CHECK-CVT-NEXT: mov x8, v0.d[1] +; CHECK-CVT-NEXT: fmov x9, d0 +; CHECK-CVT-NEXT: scvtf s2, x9 +; CHECK-CVT-NEXT: mov x9, v1.d[1] +; CHECK-CVT-NEXT: scvtf s0, x8 +; CHECK-CVT-NEXT: fmov x8, d1 +; CHECK-CVT-NEXT: scvtf s1, x8 +; CHECK-CVT-NEXT: mov v2.s[1], v0.s[0] +; CHECK-CVT-NEXT: scvtf s0, x9 +; CHECK-CVT-NEXT: mov v2.s[2], v1.s[0] +; CHECK-CVT-NEXT: movi v1.4s, #127, msl #8 +; CHECK-CVT-NEXT: mov v2.s[3], v0.s[0] +; CHECK-CVT-NEXT: movi v0.4s, #1 +; CHECK-CVT-NEXT: ushr v3.4s, v2.4s, #16 +; CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-NEXT: and v0.16b, v3.16b, v0.16b +; CHECK-CVT-NEXT: fcmeq v3.4s, v2.4s, v2.4s +; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16 +; CHECK-CVT-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-CVT-NEXT: bif v0.16b, v2.16b, v3.16b ; CHECK-CVT-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-CVT-NEXT: ret ; ; 
CHECK-BF16-LABEL: sitofp_i64: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: scvtf v0.2d, v0.2d -; CHECK-BF16-NEXT: scvtf v1.2d, v1.2d -; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d -; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-NEXT: mov x8, v0.d[1] +; CHECK-BF16-NEXT: fmov x9, d0 +; CHECK-BF16-NEXT: scvtf s2, x9 +; CHECK-BF16-NEXT: mov x9, v1.d[1] +; CHECK-BF16-NEXT: scvtf s0, x8 +; CHECK-BF16-NEXT: fmov x8, d1 +; CHECK-BF16-NEXT: mov v2.s[1], v0.s[0] +; CHECK-BF16-NEXT: scvtf s0, x8 +; CHECK-BF16-NEXT: mov v2.s[2], v0.s[0] +; CHECK-BF16-NEXT: scvtf s0, x9 +; CHECK-BF16-NEXT: mov v2.s[3], v0.s[0] +; CHECK-BF16-NEXT: bfcvtn v0.4h, v2.4s ; CHECK-BF16-NEXT: ret %1 = sitofp <4 x i64> %a to <4 x bfloat> ret <4 x bfloat> %1 @@ -413,29 +427,43 @@ define <4 x bfloat> @uitofp_i32(<4 x i32> %a) #0 { define <4 x bfloat> @uitofp_i64(<4 x i64> %a) #0 { ; CHECK-CVT-LABEL: uitofp_i64: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: ucvtf v0.2d, v0.2d -; CHECK-CVT-NEXT: ucvtf v1.2d, v1.2d -; CHECK-CVT-NEXT: movi v2.4s, #127, msl #8 -; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d -; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-CVT-NEXT: add v2.4s, v0.4s, v2.4s -; CHECK-CVT-NEXT: and v1.16b, v3.16b, v1.16b -; CHECK-CVT-NEXT: fcmeq v3.4s, v0.4s, v0.4s -; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-CVT-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v3.16b +; CHECK-CVT-NEXT: mov x8, v0.d[1] +; CHECK-CVT-NEXT: fmov x9, d0 +; CHECK-CVT-NEXT: ucvtf s2, x9 +; CHECK-CVT-NEXT: mov x9, v1.d[1] +; CHECK-CVT-NEXT: ucvtf s0, x8 +; CHECK-CVT-NEXT: fmov x8, d1 +; CHECK-CVT-NEXT: ucvtf s1, x8 +; CHECK-CVT-NEXT: mov v2.s[1], v0.s[0] +; CHECK-CVT-NEXT: ucvtf s0, x9 +; CHECK-CVT-NEXT: mov v2.s[2], v1.s[0] +; CHECK-CVT-NEXT: movi v1.4s, #127, msl #8 +; CHECK-CVT-NEXT: mov v2.s[3], v0.s[0] +; CHECK-CVT-NEXT: movi v0.4s, #1 +; CHECK-CVT-NEXT: ushr v3.4s, v2.4s, #16 +; 
CHECK-CVT-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-CVT-NEXT: and v0.16b, v3.16b, v0.16b +; CHECK-CVT-NEXT: fcmeq v3.4s, v2.4s, v2.4s +; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16 +; CHECK-CVT-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-CVT-NEXT: bif v0.16b, v2.16b, v3.16b ; CHECK-CVT-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-CVT-NEXT: ret ; ; CHECK-BF16-LABEL: uitofp_i64: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: ucvtf v0.2d, v0.2d -; CHECK-BF16-NEXT: ucvtf v1.2d, v1.2d -; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d -; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s +; CHECK-BF16-NEXT: mov x8, v0.d[1] +; CHECK-BF16-NEXT: fmov x9, d0 +; CHECK-BF16-NEXT: ucvtf s2, x9 +; CHECK-BF16-NEXT: mov x9, v1.d[1] +; CHECK-BF16-NEXT: ucvtf s0, x8 +; CHECK-BF16-NEXT: fmov x8, d1 +; CHECK-BF16-NEXT: mov v2.s[1], v0.s[0] +; CHECK-BF16-NEXT: ucvtf s0, x8 +; CHECK-BF16-NEXT: mov v2.s[2], v0.s[0] +; CHECK-BF16-NEXT: ucvtf s0, x9 +; CHECK-BF16-NEXT: mov v2.s[3], v0.s[0] +; CHECK-BF16-NEXT: bfcvtn v0.4h, v2.4s ; CHECK-BF16-NEXT: ret %1 = uitofp <4 x i64> %a to <4 x bfloat> ret <4 x bfloat> %1 diff --git a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll index 2eaa58de92807..3a55b68f2d1a3 100644 --- a/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll +++ b/llvm/test/CodeGen/AArch64/bf16-v8-instructions.ll @@ -489,45 +489,74 @@ define <8 x bfloat> @sitofp_i32(<8 x i32> %a) #0 { define <8 x bfloat> @sitofp_i64(<8 x i64> %a) #0 { ; CHECK-CVT-LABEL: sitofp_i64: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: scvtf v2.2d, v2.2d -; CHECK-CVT-NEXT: scvtf v0.2d, v0.2d -; CHECK-CVT-NEXT: scvtf v3.2d, v3.2d -; CHECK-CVT-NEXT: scvtf v1.2d, v1.2d -; CHECK-CVT-NEXT: fcvtn v2.2s, v2.2d -; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d -; CHECK-CVT-NEXT: fcvtn2 v2.4s, v3.2d -; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-CVT-NEXT: movi v1.4s, #1 -; CHECK-CVT-NEXT: movi v3.4s, #127, msl #8 -; CHECK-CVT-NEXT: ushr v4.4s, v2.4s, #16 -; CHECK-CVT-NEXT: ushr v5.4s, 
v0.4s, #16 -; CHECK-CVT-NEXT: add v6.4s, v2.4s, v3.4s -; CHECK-CVT-NEXT: add v3.4s, v0.4s, v3.4s -; CHECK-CVT-NEXT: and v4.16b, v4.16b, v1.16b -; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b +; CHECK-CVT-NEXT: fmov x10, d2 +; CHECK-CVT-NEXT: mov x8, v2.d[1] +; CHECK-CVT-NEXT: mov x9, v0.d[1] +; CHECK-CVT-NEXT: scvtf s2, x10 +; CHECK-CVT-NEXT: fmov x10, d0 +; CHECK-CVT-NEXT: scvtf s0, x8 +; CHECK-CVT-NEXT: scvtf s5, x9 +; CHECK-CVT-NEXT: fmov x9, d3 +; CHECK-CVT-NEXT: mov x8, v3.d[1] +; CHECK-CVT-NEXT: scvtf s4, x10 +; CHECK-CVT-NEXT: fmov x10, d1 +; CHECK-CVT-NEXT: scvtf s3, x9 +; CHECK-CVT-NEXT: mov x9, v1.d[1] +; CHECK-CVT-NEXT: mov v2.s[1], v0.s[0] +; CHECK-CVT-NEXT: scvtf s0, x10 +; CHECK-CVT-NEXT: scvtf s1, x8 +; CHECK-CVT-NEXT: mov v4.s[1], v5.s[0] +; CHECK-CVT-NEXT: mov v2.s[2], v3.s[0] +; CHECK-CVT-NEXT: scvtf s3, x9 +; CHECK-CVT-NEXT: mov v4.s[2], v0.s[0] +; CHECK-CVT-NEXT: movi v0.4s, #1 +; CHECK-CVT-NEXT: mov v2.s[3], v1.s[0] +; CHECK-CVT-NEXT: movi v1.4s, #127, msl #8 +; CHECK-CVT-NEXT: mov v4.s[3], v3.s[0] +; CHECK-CVT-NEXT: ushr v3.4s, v2.4s, #16 +; CHECK-CVT-NEXT: add v6.4s, v2.4s, v1.4s +; CHECK-CVT-NEXT: ushr v5.4s, v4.4s, #16 +; CHECK-CVT-NEXT: add v1.4s, v4.4s, v1.4s +; CHECK-CVT-NEXT: and v3.16b, v3.16b, v0.16b +; CHECK-CVT-NEXT: and v0.16b, v5.16b, v0.16b ; CHECK-CVT-NEXT: fcmeq v5.4s, v2.4s, v2.4s ; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16 -; CHECK-CVT-NEXT: add v4.4s, v4.4s, v6.4s -; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s -; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-CVT-NEXT: bit v2.16b, v4.16b, v5.16b -; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v6.16b -; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; CHECK-CVT-NEXT: add v3.4s, v3.4s, v6.4s +; CHECK-CVT-NEXT: fcmeq v6.4s, v4.4s, v4.4s +; CHECK-CVT-NEXT: orr v4.4s, #64, lsl #16 +; CHECK-CVT-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-CVT-NEXT: mov v1.16b, v5.16b +; CHECK-CVT-NEXT: bif v0.16b, v4.16b, v6.16b +; CHECK-CVT-NEXT: bsl v1.16b, v3.16b, v2.16b +; 
CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v1.8h ; CHECK-CVT-NEXT: ret ; ; CHECK-BF16-LABEL: sitofp_i64: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: scvtf v0.2d, v0.2d -; CHECK-BF16-NEXT: scvtf v2.2d, v2.2d -; CHECK-BF16-NEXT: scvtf v1.2d, v1.2d -; CHECK-BF16-NEXT: scvtf v3.2d, v3.2d -; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d -; CHECK-BF16-NEXT: fcvtn v2.2s, v2.2d -; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-BF16-NEXT: fcvtn2 v2.4s, v3.2d -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v2.4s +; CHECK-BF16-NEXT: mov x9, v0.d[1] +; CHECK-BF16-NEXT: fmov x10, d0 +; CHECK-BF16-NEXT: mov x8, v2.d[1] +; CHECK-BF16-NEXT: scvtf s4, x10 +; CHECK-BF16-NEXT: fmov x10, d1 +; CHECK-BF16-NEXT: scvtf s0, x9 +; CHECK-BF16-NEXT: fmov x9, d2 +; CHECK-BF16-NEXT: scvtf s2, x8 +; CHECK-BF16-NEXT: mov x8, v1.d[1] +; CHECK-BF16-NEXT: scvtf s1, x9 +; CHECK-BF16-NEXT: fmov x9, d3 +; CHECK-BF16-NEXT: mov v4.s[1], v0.s[0] +; CHECK-BF16-NEXT: scvtf s0, x10 +; CHECK-BF16-NEXT: mov x10, v3.d[1] +; CHECK-BF16-NEXT: scvtf s3, x9 +; CHECK-BF16-NEXT: mov v1.s[1], v2.s[0] +; CHECK-BF16-NEXT: scvtf s2, x8 +; CHECK-BF16-NEXT: mov v4.s[2], v0.s[0] +; CHECK-BF16-NEXT: scvtf s0, x10 +; CHECK-BF16-NEXT: mov v1.s[2], v3.s[0] +; CHECK-BF16-NEXT: mov v4.s[3], v2.s[0] +; CHECK-BF16-NEXT: mov v1.s[3], v0.s[0] +; CHECK-BF16-NEXT: bfcvtn v0.4h, v4.4s +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s ; CHECK-BF16-NEXT: ret %1 = sitofp <8 x i64> %a to <8 x bfloat> ret <8 x bfloat> %1 @@ -712,45 +741,74 @@ define <8 x bfloat> @uitofp_i32(<8 x i32> %a) #0 { define <8 x bfloat> @uitofp_i64(<8 x i64> %a) #0 { ; CHECK-CVT-LABEL: uitofp_i64: ; CHECK-CVT: // %bb.0: -; CHECK-CVT-NEXT: ucvtf v2.2d, v2.2d -; CHECK-CVT-NEXT: ucvtf v0.2d, v0.2d -; CHECK-CVT-NEXT: ucvtf v3.2d, v3.2d -; CHECK-CVT-NEXT: ucvtf v1.2d, v1.2d -; CHECK-CVT-NEXT: fcvtn v2.2s, v2.2d -; CHECK-CVT-NEXT: fcvtn v0.2s, v0.2d -; CHECK-CVT-NEXT: fcvtn2 v2.4s, v3.2d -; CHECK-CVT-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-CVT-NEXT: movi v1.4s, #1 -; 
CHECK-CVT-NEXT: movi v3.4s, #127, msl #8 -; CHECK-CVT-NEXT: ushr v4.4s, v2.4s, #16 -; CHECK-CVT-NEXT: ushr v5.4s, v0.4s, #16 -; CHECK-CVT-NEXT: add v6.4s, v2.4s, v3.4s -; CHECK-CVT-NEXT: add v3.4s, v0.4s, v3.4s -; CHECK-CVT-NEXT: and v4.16b, v4.16b, v1.16b -; CHECK-CVT-NEXT: and v1.16b, v5.16b, v1.16b +; CHECK-CVT-NEXT: fmov x10, d2 +; CHECK-CVT-NEXT: mov x8, v2.d[1] +; CHECK-CVT-NEXT: mov x9, v0.d[1] +; CHECK-CVT-NEXT: ucvtf s2, x10 +; CHECK-CVT-NEXT: fmov x10, d0 +; CHECK-CVT-NEXT: ucvtf s0, x8 +; CHECK-CVT-NEXT: ucvtf s5, x9 +; CHECK-CVT-NEXT: fmov x9, d3 +; CHECK-CVT-NEXT: mov x8, v3.d[1] +; CHECK-CVT-NEXT: ucvtf s4, x10 +; CHECK-CVT-NEXT: fmov x10, d1 +; CHECK-CVT-NEXT: ucvtf s3, x9 +; CHECK-CVT-NEXT: mov x9, v1.d[1] +; CHECK-CVT-NEXT: mov v2.s[1], v0.s[0] +; CHECK-CVT-NEXT: ucvtf s0, x10 +; CHECK-CVT-NEXT: ucvtf s1, x8 +; CHECK-CVT-NEXT: mov v4.s[1], v5.s[0] +; CHECK-CVT-NEXT: mov v2.s[2], v3.s[0] +; CHECK-CVT-NEXT: ucvtf s3, x9 +; CHECK-CVT-NEXT: mov v4.s[2], v0.s[0] +; CHECK-CVT-NEXT: movi v0.4s, #1 +; CHECK-CVT-NEXT: mov v2.s[3], v1.s[0] +; CHECK-CVT-NEXT: movi v1.4s, #127, msl #8 +; CHECK-CVT-NEXT: mov v4.s[3], v3.s[0] +; CHECK-CVT-NEXT: ushr v3.4s, v2.4s, #16 +; CHECK-CVT-NEXT: add v6.4s, v2.4s, v1.4s +; CHECK-CVT-NEXT: ushr v5.4s, v4.4s, #16 +; CHECK-CVT-NEXT: add v1.4s, v4.4s, v1.4s +; CHECK-CVT-NEXT: and v3.16b, v3.16b, v0.16b +; CHECK-CVT-NEXT: and v0.16b, v5.16b, v0.16b ; CHECK-CVT-NEXT: fcmeq v5.4s, v2.4s, v2.4s ; CHECK-CVT-NEXT: orr v2.4s, #64, lsl #16 -; CHECK-CVT-NEXT: add v4.4s, v4.4s, v6.4s -; CHECK-CVT-NEXT: fcmeq v6.4s, v0.4s, v0.4s -; CHECK-CVT-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-CVT-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-CVT-NEXT: bit v2.16b, v4.16b, v5.16b -; CHECK-CVT-NEXT: bit v0.16b, v1.16b, v6.16b -; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; CHECK-CVT-NEXT: add v3.4s, v3.4s, v6.4s +; CHECK-CVT-NEXT: fcmeq v6.4s, v4.4s, v4.4s +; CHECK-CVT-NEXT: orr v4.4s, #64, lsl #16 +; CHECK-CVT-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-CVT-NEXT: 
mov v1.16b, v5.16b +; CHECK-CVT-NEXT: bif v0.16b, v4.16b, v6.16b +; CHECK-CVT-NEXT: bsl v1.16b, v3.16b, v2.16b +; CHECK-CVT-NEXT: uzp2 v0.8h, v0.8h, v1.8h ; CHECK-CVT-NEXT: ret ; ; CHECK-BF16-LABEL: uitofp_i64: ; CHECK-BF16: // %bb.0: -; CHECK-BF16-NEXT: ucvtf v0.2d, v0.2d -; CHECK-BF16-NEXT: ucvtf v2.2d, v2.2d -; CHECK-BF16-NEXT: ucvtf v1.2d, v1.2d -; CHECK-BF16-NEXT: ucvtf v3.2d, v3.2d -; CHECK-BF16-NEXT: fcvtn v0.2s, v0.2d -; CHECK-BF16-NEXT: fcvtn v2.2s, v2.2d -; CHECK-BF16-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-BF16-NEXT: fcvtn2 v2.4s, v3.2d -; CHECK-BF16-NEXT: bfcvtn v0.4h, v0.4s -; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v2.4s +; CHECK-BF16-NEXT: mov x9, v0.d[1] +; CHECK-BF16-NEXT: fmov x10, d0 +; CHECK-BF16-NEXT: mov x8, v2.d[1] +; CHECK-BF16-NEXT: ucvtf s4, x10 +; CHECK-BF16-NEXT: fmov x10, d1 +; CHECK-BF16-NEXT: ucvtf s0, x9 +; CHECK-BF16-NEXT: fmov x9, d2 +; CHECK-BF16-NEXT: ucvtf s2, x8 +; CHECK-BF16-NEXT: mov x8, v1.d[1] +; CHECK-BF16-NEXT: ucvtf s1, x9 +; CHECK-BF16-NEXT: fmov x9, d3 +; CHECK-BF16-NEXT: mov v4.s[1], v0.s[0] +; CHECK-BF16-NEXT: ucvtf s0, x10 +; CHECK-BF16-NEXT: mov x10, v3.d[1] +; CHECK-BF16-NEXT: ucvtf s3, x9 +; CHECK-BF16-NEXT: mov v1.s[1], v2.s[0] +; CHECK-BF16-NEXT: ucvtf s2, x8 +; CHECK-BF16-NEXT: mov v4.s[2], v0.s[0] +; CHECK-BF16-NEXT: ucvtf s0, x10 +; CHECK-BF16-NEXT: mov v1.s[2], v3.s[0] +; CHECK-BF16-NEXT: mov v4.s[3], v2.s[0] +; CHECK-BF16-NEXT: mov v1.s[3], v0.s[0] +; CHECK-BF16-NEXT: bfcvtn v0.4h, v4.4s +; CHECK-BF16-NEXT: bfcvtn2 v0.8h, v1.4s ; CHECK-BF16-NEXT: ret %1 = uitofp <8 x i64> %a to <8 x bfloat> ret <8 x bfloat> %1 diff --git a/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll b/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll index ec504b4782547..baca159f9dd55 100644 --- a/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/complex-int-to-fp.ll @@ -5,9 +5,12 @@ define void @autogen_SD19655(ptr %addr, ptr %addrfloat) { ; CHECK-LABEL: autogen_SD19655: ; CHECK: // %bb.0: ; CHECK-NEXT: ldr q0, [x0] -; 
CHECK-NEXT: scvtf.2d v0, v0 -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: str d0, [x1] +; CHECK-NEXT: mov.d x8, v0[1] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: scvtf s1, x9 +; CHECK-NEXT: scvtf s0, x8 +; CHECK-NEXT: mov.s v1[1], v0[0] +; CHECK-NEXT: str d1, [x1] ; CHECK-NEXT: ret %T = load <2 x i64>, ptr %addr %F = sitofp <2 x i64> %T to <2 x float> @@ -88,8 +91,12 @@ define <2 x double> @test_unsigned_v2i8_to_v2f64(<2 x i8> %v) nounwind readnone define <2 x float> @test_signed_v2i64_to_v2f32(<2 x i64> %v) nounwind readnone { ; CHECK-LABEL: test_signed_v2i64_to_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: scvtf.2d v0, v0 -; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: mov.d x8, v0[1] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: scvtf s0, x9 +; CHECK-NEXT: scvtf s1, x8 +; CHECK-NEXT: mov.s v0[1], v1[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %conv = sitofp <2 x i64> %v to <2 x float> @@ -98,8 +105,12 @@ define <2 x float> @test_signed_v2i64_to_v2f32(<2 x i64> %v) nounwind readnone { define <2 x float> @test_unsigned_v2i64_to_v2f32(<2 x i64> %v) nounwind readnone { ; CHECK-LABEL: test_unsigned_v2i64_to_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ucvtf.2d v0, v0 -; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: mov.d x8, v0[1] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: ucvtf s0, x9 +; CHECK-NEXT: ucvtf s1, x8 +; CHECK-NEXT: mov.s v0[1], v1[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %conv = uitofp <2 x i64> %v to <2 x float> diff --git a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll index b40c0656a60e4..b65334e2461fd 100644 --- a/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/AArch64/fold-int-pow2-with-fmul-or-fdiv.ll @@ -262,10 +262,13 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou ; CHECK-NEON-NEXT: mov w8, #2 // =0x2 ; CHECK-NEON-NEXT: dup v1.2d, x8 ; 
CHECK-NEON-NEXT: ushl v0.2d, v1.2d, v0.2d -; CHECK-NEON-NEXT: fmov v1.2s, #15.00000000 -; CHECK-NEON-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEON-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEON-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-NEON-NEXT: mov x8, v0.d[1] +; CHECK-NEON-NEXT: fmov x9, d0 +; CHECK-NEON-NEXT: ucvtf s1, x9 +; CHECK-NEON-NEXT: ucvtf s0, x8 +; CHECK-NEON-NEXT: mov v1.s[1], v0.s[0] +; CHECK-NEON-NEXT: fmov v0.2s, #15.00000000 +; CHECK-NEON-NEXT: fmul v0.2s, v1.2s, v0.2s ; CHECK-NEON-NEXT: ret ; ; CHECK-NO-NEON-LABEL: fmul_pow_shl_cnt_vec_fail_expensive_cast: diff --git a/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll b/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll index 83e60c1089762..1364c47adff2d 100644 --- a/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll +++ b/llvm/test/CodeGen/AArch64/fp-intrinsics-vector.ll @@ -193,10 +193,17 @@ define <4 x float> @uitofp_v4f32_v4i32(<4 x i32> %x) #0 { define <4 x float> @sitofp_v4f32_v4i64(<4 x i64> %x) #0 { ; CHECK-LABEL: sitofp_v4f32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: scvtf s0, x9 +; CHECK-NEXT: mov x9, v1.d[1] +; CHECK-NEXT: scvtf s2, x8 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: scvtf s1, x8 +; CHECK-NEXT: mov v0.s[1], v2.s[0] +; CHECK-NEXT: mov v0.s[2], v1.s[0] +; CHECK-NEXT: scvtf s1, x9 +; CHECK-NEXT: mov v0.s[3], v1.s[0] ; CHECK-NEXT: ret %val = call <4 x float> @llvm.experimental.constrained.sitofp.v4f32.v4i64(<4 x i64> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 ret <4 x float> %val @@ -205,10 +212,38 @@ define <4 x float> @sitofp_v4f32_v4i64(<4 x i64> %x) #0 { define <4 x float> @uitofp_v4f32_v4i64(<4 x i64> %x) #0 { ; CHECK-LABEL: uitofp_v4f32_v4i64: ; CHECK: // %bb.0: -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: 
fcvtn2 v0.4s, v1.2d +; CHECK-NEXT: movi v2.2d, #0x000000ffffffff +; CHECK-NEXT: ushr v3.2d, v1.2d, #32 +; CHECK-NEXT: ushr v4.2d, v0.2d, #32 +; CHECK-NEXT: mov x8, v3.d[1] +; CHECK-NEXT: mov x9, v4.d[1] +; CHECK-NEXT: fmov x10, d3 +; CHECK-NEXT: and v1.16b, v1.16b, v2.16b +; CHECK-NEXT: and v0.16b, v0.16b, v2.16b +; CHECK-NEXT: fmov x11, d4 +; CHECK-NEXT: scvtf s2, x10 +; CHECK-NEXT: mov x10, v1.d[1] +; CHECK-NEXT: scvtf s3, x8 +; CHECK-NEXT: scvtf s4, x11 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: scvtf s5, x9 +; CHECK-NEXT: mov w9, #1333788672 // =0x4f800000 +; CHECK-NEXT: fmov x11, d1 +; CHECK-NEXT: dup v1.2s, w9 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: scvtf s0, x10 +; CHECK-NEXT: mov v2.s[1], v3.s[0] +; CHECK-NEXT: scvtf s6, x11 +; CHECK-NEXT: scvtf s3, x8 +; CHECK-NEXT: mov v4.s[1], v5.s[0] +; CHECK-NEXT: scvtf s5, x9 +; CHECK-NEXT: mov v6.s[1], v0.s[0] +; CHECK-NEXT: fmul v0.2s, v2.2s, v1.2s +; CHECK-NEXT: fmul v1.2s, v4.2s, v1.2s +; CHECK-NEXT: mov v5.s[1], v3.s[0] +; CHECK-NEXT: fadd v2.2s, v0.2s, v6.2s +; CHECK-NEXT: fadd v0.2s, v1.2s, v5.2s +; CHECK-NEXT: mov v0.d[1], v2.d[0] ; CHECK-NEXT: ret %val = call <4 x float> @llvm.experimental.constrained.uitofp.v4f32.v4i64(<4 x i64> %x, metadata !"round.tonearest", metadata !"fpexcept.strict") #0 ret <4 x float> %val diff --git a/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll b/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll index 0a7319b9ce11e..9da6f583cec01 100644 --- a/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll +++ b/llvm/test/CodeGen/AArch64/fprcvt-cvtf.ll @@ -210,15 +210,20 @@ define <1 x float> @scvtf_f32i64_simple(<1 x i64> %x) { ; CHECK-LABEL: scvtf_f32i64_simple: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: scvtf s0, d0 +; CHECK-NEXT: mov v1.s[0], v0.s[0] +; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret ; ; CHECK-NO-FPRCVT-LABEL: scvtf_f32i64_simple: ; CHECK-NO-FPRCVT: 
// %bb.0: ; CHECK-NO-FPRCVT-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NO-FPRCVT-NEXT: scvtf v0.2d, v0.2d -; CHECK-NO-FPRCVT-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NO-FPRCVT-NEXT: fmov x8, d0 +; CHECK-NO-FPRCVT-NEXT: movi d1, #0000000000000000 +; CHECK-NO-FPRCVT-NEXT: scvtf s0, x8 +; CHECK-NO-FPRCVT-NEXT: mov v1.s[0], v0.s[0] +; CHECK-NO-FPRCVT-NEXT: fmov d0, d1 ; CHECK-NO-FPRCVT-NEXT: ret %conv = sitofp <1 x i64> %x to <1 x float> ret <1 x float> %conv @@ -426,15 +431,20 @@ define <1 x float> @ucvtf_f32i64_simple(<1 x i64> %x) { ; CHECK-LABEL: ucvtf_f32i64_simple: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: ucvtf s0, d0 +; CHECK-NEXT: mov v1.s[0], v0.s[0] +; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret ; ; CHECK-NO-FPRCVT-LABEL: ucvtf_f32i64_simple: ; CHECK-NO-FPRCVT: // %bb.0: ; CHECK-NO-FPRCVT-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NO-FPRCVT-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NO-FPRCVT-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NO-FPRCVT-NEXT: fmov x8, d0 +; CHECK-NO-FPRCVT-NEXT: movi d1, #0000000000000000 +; CHECK-NO-FPRCVT-NEXT: ucvtf s0, x8 +; CHECK-NO-FPRCVT-NEXT: mov v1.s[0], v0.s[0] +; CHECK-NO-FPRCVT-NEXT: fmov d0, d1 ; CHECK-NO-FPRCVT-NEXT: ret %conv = uitofp <1 x i64> %x to <1 x float> ret <1 x float> %conv diff --git a/llvm/test/CodeGen/AArch64/itofp-bf16.ll b/llvm/test/CodeGen/AArch64/itofp-bf16.ll index 58591b11c184f..42641693c4081 100644 --- a/llvm/test/CodeGen/AArch64/itofp-bf16.ll +++ b/llvm/test/CodeGen/AArch64/itofp-bf16.ll @@ -349,22 +349,27 @@ define <3 x bfloat> @stofp_v3i64_v3bf16(<3 x i64> %a) { ; CHECK-LABEL: stofp_v3i64_v3bf16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: 
mov v0.d[1], v1.d[0] -; CHECK-NEXT: scvtf v1.2d, v2.2d -; CHECK-NEXT: movi v2.4s, #127, msl #8 -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v2.4s, v0.4s, v2.4s -; CHECK-NEXT: and v1.16b, v3.16b, v1.16b -; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b +; CHECK-NEXT: scvtf s0, x8 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: scvtf s1, x8 +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: mov v3.s[0], v0.s[0] +; CHECK-NEXT: scvtf s0, x8 +; CHECK-NEXT: mov v3.s[1], v1.s[0] +; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: mov v3.s[2], v0.s[0] +; CHECK-NEXT: movi v0.4s, #1 +; CHECK-NEXT: ushr v2.4s, v3.4s, #16 +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: and v0.16b, v2.16b, v0.16b +; CHECK-NEXT: fcmeq v2.4s, v3.4s, v3.4s +; CHECK-NEXT: orr v3.4s, #64, lsl #16 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: bif v0.16b, v3.16b, v2.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret entry: @@ -376,22 +381,27 @@ define <3 x bfloat> @utofp_v3i64_v3bf16(<3 x i64> %a) { ; CHECK-LABEL: utofp_v3i64_v3bf16: ; CHECK: // %bb.0: // %entry ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: movi v3.2d, #0000000000000000 ; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-NEXT: mov v0.d[1], v1.d[0] -; CHECK-NEXT: ucvtf v1.2d, v2.2d -; CHECK-NEXT: movi v2.4s, #127, msl #8 -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v2.4s, v0.4s, v2.4s -; CHECK-NEXT: and v1.16b, v3.16b, v1.16b -; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: add v1.4s, 
v1.4s, v2.4s -; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b +; CHECK-NEXT: ucvtf s0, x8 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ucvtf s1, x8 +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: mov v3.s[0], v0.s[0] +; CHECK-NEXT: ucvtf s0, x8 +; CHECK-NEXT: mov v3.s[1], v1.s[0] +; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: mov v3.s[2], v0.s[0] +; CHECK-NEXT: movi v0.4s, #1 +; CHECK-NEXT: ushr v2.4s, v3.4s, #16 +; CHECK-NEXT: add v1.4s, v3.4s, v1.4s +; CHECK-NEXT: and v0.16b, v2.16b, v0.16b +; CHECK-NEXT: fcmeq v2.4s, v3.4s, v3.4s +; CHECK-NEXT: orr v3.4s, #64, lsl #16 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: bif v0.16b, v3.16b, v2.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret entry: @@ -402,19 +412,26 @@ entry: define <4 x bfloat> @stofp_v4i64_v4bf16(<4 x i64> %a) { ; CHECK-LABEL: stofp_v4i64_v4bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: movi v2.4s, #127, msl #8 -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v2.4s, v0.4s, v2.4s -; CHECK-NEXT: and v1.16b, v3.16b, v1.16b -; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: scvtf s2, x9 +; CHECK-NEXT: mov x9, v1.d[1] +; CHECK-NEXT: scvtf s0, x8 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: scvtf s1, x8 +; CHECK-NEXT: mov v2.s[1], v0.s[0] +; CHECK-NEXT: scvtf s0, x9 +; CHECK-NEXT: mov v2.s[2], v1.s[0] +; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: mov v2.s[3], v0.s[0] +; CHECK-NEXT: movi v0.4s, #1 +; CHECK-NEXT: ushr v3.4s, v2.4s, #16 +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: and v0.16b, v3.16b, v0.16b +; CHECK-NEXT: fcmeq v3.4s, v2.4s, v2.4s +; CHECK-NEXT: orr v2.4s, #64, lsl #16 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; 
CHECK-NEXT: bif v0.16b, v2.16b, v3.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret entry: @@ -425,19 +442,26 @@ entry: define <4 x bfloat> @utofp_v4i64_v4bf16(<4 x i64> %a) { ; CHECK-LABEL: utofp_v4i64_v4bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: movi v2.4s, #127, msl #8 -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: ushr v3.4s, v0.4s, #16 -; CHECK-NEXT: add v2.4s, v0.4s, v2.4s -; CHECK-NEXT: and v1.16b, v3.16b, v1.16b -; CHECK-NEXT: fcmeq v3.4s, v0.4s, v0.4s -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: add v1.4s, v1.4s, v2.4s -; CHECK-NEXT: bit v0.16b, v1.16b, v3.16b +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: ucvtf s2, x9 +; CHECK-NEXT: mov x9, v1.d[1] +; CHECK-NEXT: ucvtf s0, x8 +; CHECK-NEXT: fmov x8, d1 +; CHECK-NEXT: ucvtf s1, x8 +; CHECK-NEXT: mov v2.s[1], v0.s[0] +; CHECK-NEXT: ucvtf s0, x9 +; CHECK-NEXT: mov v2.s[2], v1.s[0] +; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: mov v2.s[3], v0.s[0] +; CHECK-NEXT: movi v0.4s, #1 +; CHECK-NEXT: ushr v3.4s, v2.4s, #16 +; CHECK-NEXT: add v1.4s, v2.4s, v1.4s +; CHECK-NEXT: and v0.16b, v3.16b, v0.16b +; CHECK-NEXT: fcmeq v3.4s, v2.4s, v2.4s +; CHECK-NEXT: orr v2.4s, #64, lsl #16 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: bif v0.16b, v2.16b, v3.16b ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 ; CHECK-NEXT: ret entry: @@ -448,31 +472,46 @@ entry: define <8 x bfloat> @stofp_v8i64_v8bf16(<8 x i64> %a) { ; CHECK-LABEL: stofp_v8i64_v8bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf v2.2d, v2.2d -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: scvtf v3.2d, v3.2d -; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: fcvtn v2.2s, v2.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn2 v2.4s, v3.2d -; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: movi v3.4s, #127, msl #8 -; CHECK-NEXT: ushr 
v4.4s, v2.4s, #16 -; CHECK-NEXT: ushr v5.4s, v0.4s, #16 -; CHECK-NEXT: add v6.4s, v2.4s, v3.4s -; CHECK-NEXT: add v3.4s, v0.4s, v3.4s -; CHECK-NEXT: and v4.16b, v4.16b, v1.16b -; CHECK-NEXT: and v1.16b, v5.16b, v1.16b +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: mov x8, v2.d[1] +; CHECK-NEXT: mov x9, v0.d[1] +; CHECK-NEXT: scvtf s2, x10 +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: scvtf s0, x8 +; CHECK-NEXT: scvtf s5, x9 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: mov x8, v3.d[1] +; CHECK-NEXT: scvtf s4, x10 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: scvtf s3, x9 +; CHECK-NEXT: mov x9, v1.d[1] +; CHECK-NEXT: mov v2.s[1], v0.s[0] +; CHECK-NEXT: scvtf s0, x10 +; CHECK-NEXT: scvtf s1, x8 +; CHECK-NEXT: mov v4.s[1], v5.s[0] +; CHECK-NEXT: mov v2.s[2], v3.s[0] +; CHECK-NEXT: scvtf s3, x9 +; CHECK-NEXT: mov v4.s[2], v0.s[0] +; CHECK-NEXT: movi v0.4s, #1 +; CHECK-NEXT: mov v2.s[3], v1.s[0] +; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: mov v4.s[3], v3.s[0] +; CHECK-NEXT: ushr v3.4s, v2.4s, #16 +; CHECK-NEXT: add v6.4s, v2.4s, v1.4s +; CHECK-NEXT: ushr v5.4s, v4.4s, #16 +; CHECK-NEXT: add v1.4s, v4.4s, v1.4s +; CHECK-NEXT: and v3.16b, v3.16b, v0.16b +; CHECK-NEXT: and v0.16b, v5.16b, v0.16b ; CHECK-NEXT: fcmeq v5.4s, v2.4s, v2.4s ; CHECK-NEXT: orr v2.4s, #64, lsl #16 -; CHECK-NEXT: add v4.4s, v4.4s, v6.4s -; CHECK-NEXT: fcmeq v6.4s, v0.4s, v0.4s -; CHECK-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v2.16b, v4.16b, v5.16b -; CHECK-NEXT: bit v0.16b, v1.16b, v6.16b -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; CHECK-NEXT: add v3.4s, v3.4s, v6.4s +; CHECK-NEXT: fcmeq v6.4s, v4.4s, v4.4s +; CHECK-NEXT: orr v4.4s, #64, lsl #16 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: mov v1.16b, v5.16b +; CHECK-NEXT: bif v0.16b, v4.16b, v6.16b +; CHECK-NEXT: bsl v1.16b, v3.16b, v2.16b +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret entry: %c = sitofp <8 x i64> %a to <8 x bfloat> @@ -482,31 +521,46 @@ entry: define <8 x bfloat> 
@utofp_v8i64_v8bf16(<8 x i64> %a) { ; CHECK-LABEL: utofp_v8i64_v8bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ucvtf v2.2d, v2.2d -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: ucvtf v3.2d, v3.2d -; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: fcvtn v2.2s, v2.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn2 v2.4s, v3.2d -; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: movi v3.4s, #127, msl #8 -; CHECK-NEXT: ushr v4.4s, v2.4s, #16 -; CHECK-NEXT: ushr v5.4s, v0.4s, #16 -; CHECK-NEXT: add v6.4s, v2.4s, v3.4s -; CHECK-NEXT: add v3.4s, v0.4s, v3.4s -; CHECK-NEXT: and v4.16b, v4.16b, v1.16b -; CHECK-NEXT: and v1.16b, v5.16b, v1.16b +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: mov x8, v2.d[1] +; CHECK-NEXT: mov x9, v0.d[1] +; CHECK-NEXT: ucvtf s2, x10 +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: ucvtf s0, x8 +; CHECK-NEXT: ucvtf s5, x9 +; CHECK-NEXT: fmov x9, d3 +; CHECK-NEXT: mov x8, v3.d[1] +; CHECK-NEXT: ucvtf s4, x10 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: ucvtf s3, x9 +; CHECK-NEXT: mov x9, v1.d[1] +; CHECK-NEXT: mov v2.s[1], v0.s[0] +; CHECK-NEXT: ucvtf s0, x10 +; CHECK-NEXT: ucvtf s1, x8 +; CHECK-NEXT: mov v4.s[1], v5.s[0] +; CHECK-NEXT: mov v2.s[2], v3.s[0] +; CHECK-NEXT: ucvtf s3, x9 +; CHECK-NEXT: mov v4.s[2], v0.s[0] +; CHECK-NEXT: movi v0.4s, #1 +; CHECK-NEXT: mov v2.s[3], v1.s[0] +; CHECK-NEXT: movi v1.4s, #127, msl #8 +; CHECK-NEXT: mov v4.s[3], v3.s[0] +; CHECK-NEXT: ushr v3.4s, v2.4s, #16 +; CHECK-NEXT: add v6.4s, v2.4s, v1.4s +; CHECK-NEXT: ushr v5.4s, v4.4s, #16 +; CHECK-NEXT: add v1.4s, v4.4s, v1.4s +; CHECK-NEXT: and v3.16b, v3.16b, v0.16b +; CHECK-NEXT: and v0.16b, v5.16b, v0.16b ; CHECK-NEXT: fcmeq v5.4s, v2.4s, v2.4s ; CHECK-NEXT: orr v2.4s, #64, lsl #16 -; CHECK-NEXT: add v4.4s, v4.4s, v6.4s -; CHECK-NEXT: fcmeq v6.4s, v0.4s, v0.4s -; CHECK-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: bit v2.16b, v4.16b, v5.16b -; CHECK-NEXT: bit v0.16b, v1.16b, v6.16b -; 
CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; CHECK-NEXT: add v3.4s, v3.4s, v6.4s +; CHECK-NEXT: fcmeq v6.4s, v4.4s, v4.4s +; CHECK-NEXT: orr v4.4s, #64, lsl #16 +; CHECK-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-NEXT: mov v1.16b, v5.16b +; CHECK-NEXT: bif v0.16b, v4.16b, v6.16b +; CHECK-NEXT: bsl v1.16b, v3.16b, v2.16b +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v1.8h ; CHECK-NEXT: ret entry: %c = uitofp <8 x i64> %a to <8 x bfloat> @@ -516,55 +570,82 @@ entry: define <16 x bfloat> @stofp_v16i64_v16bf16(<16 x i64> %a) { ; CHECK-LABEL: stofp_v16i64_v16bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: scvtf v2.2d, v2.2d -; CHECK-NEXT: scvtf v6.2d, v6.2d -; CHECK-NEXT: scvtf v4.2d, v4.2d -; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: scvtf v3.2d, v3.2d -; CHECK-NEXT: scvtf v7.2d, v7.2d -; CHECK-NEXT: scvtf v5.2d, v5.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn v2.2s, v2.2d -; CHECK-NEXT: fcvtn v6.2s, v6.2d -; CHECK-NEXT: fcvtn v4.2s, v4.2d -; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: fcvtn2 v2.4s, v3.2d -; CHECK-NEXT: fcvtn2 v6.4s, v7.2d -; CHECK-NEXT: fcvtn2 v4.4s, v5.2d +; CHECK-NEXT: mov x8, v2.d[1] +; CHECK-NEXT: fmov x11, d0 +; CHECK-NEXT: mov x9, v0.d[1] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: mov x12, v6.d[1] +; CHECK-NEXT: scvtf s2, x11 +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: scvtf s16, x8 +; CHECK-NEXT: fmov x8, d6 +; CHECK-NEXT: scvtf s0, x10 +; CHECK-NEXT: mov x10, v4.d[1] +; CHECK-NEXT: scvtf s17, x9 +; CHECK-NEXT: mov x9, v3.d[1] +; CHECK-NEXT: scvtf s6, x12 +; CHECK-NEXT: fmov x12, d4 +; CHECK-NEXT: scvtf s4, x11 +; CHECK-NEXT: scvtf s3, x8 +; CHECK-NEXT: fmov x11, d7 +; CHECK-NEXT: mov x8, v1.d[1] +; CHECK-NEXT: mov v0.s[1], v16.s[0] +; CHECK-NEXT: scvtf s18, x10 +; CHECK-NEXT: scvtf s19, x12 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: mov v2.s[1], v17.s[0] +; CHECK-NEXT: mov x12, v5.d[1] +; CHECK-NEXT: mov v3.s[1], v6.s[0] +; CHECK-NEXT: scvtf s6, x11 +; CHECK-NEXT: fmov x11, d5 +; CHECK-NEXT: scvtf s1, x10 +; 
CHECK-NEXT: mov x10, v7.d[1] +; CHECK-NEXT: scvtf s7, x9 +; CHECK-NEXT: mov v19.s[1], v18.s[0] +; CHECK-NEXT: scvtf s16, x8 +; CHECK-NEXT: mov v0.s[2], v4.s[0] +; CHECK-NEXT: scvtf s5, x11 +; CHECK-NEXT: mov v3.s[2], v6.s[0] +; CHECK-NEXT: scvtf s4, x10 +; CHECK-NEXT: mov v2.s[2], v1.s[0] +; CHECK-NEXT: scvtf s1, x12 +; CHECK-NEXT: mov v0.s[3], v7.s[0] +; CHECK-NEXT: mov v19.s[2], v5.s[0] +; CHECK-NEXT: mov v2.s[3], v16.s[0] +; CHECK-NEXT: mov v3.s[3], v4.s[0] +; CHECK-NEXT: movi v4.4s, #127, msl #8 +; CHECK-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-NEXT: mov v19.s[3], v1.s[0] ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: movi v3.4s, #127, msl #8 -; CHECK-NEXT: ushr v7.4s, v0.4s, #16 -; CHECK-NEXT: ushr v5.4s, v2.4s, #16 -; CHECK-NEXT: ushr v16.4s, v6.4s, #16 -; CHECK-NEXT: ushr v17.4s, v4.4s, #16 -; CHECK-NEXT: add v19.4s, v0.4s, v3.4s -; CHECK-NEXT: add v18.4s, v2.4s, v3.4s -; CHECK-NEXT: add v20.4s, v6.4s, v3.4s -; CHECK-NEXT: add v3.4s, v4.4s, v3.4s -; CHECK-NEXT: and v7.16b, v7.16b, v1.16b +; CHECK-NEXT: ushr v6.4s, v2.4s, #16 +; CHECK-NEXT: ushr v7.4s, v3.4s, #16 +; CHECK-NEXT: add v17.4s, v0.4s, v4.4s +; CHECK-NEXT: add v18.4s, v2.4s, v4.4s +; CHECK-NEXT: add v20.4s, v3.4s, v4.4s +; CHECK-NEXT: ushr v16.4s, v19.4s, #16 ; CHECK-NEXT: and v5.16b, v5.16b, v1.16b -; CHECK-NEXT: and v16.16b, v16.16b, v1.16b -; CHECK-NEXT: and v1.16b, v17.16b, v1.16b +; CHECK-NEXT: add v4.4s, v19.4s, v4.4s +; CHECK-NEXT: and v6.16b, v6.16b, v1.16b +; CHECK-NEXT: and v7.16b, v7.16b, v1.16b +; CHECK-NEXT: and v1.16b, v16.16b, v1.16b +; CHECK-NEXT: add v5.4s, v5.4s, v17.4s +; CHECK-NEXT: fcmeq v16.4s, v0.4s, v0.4s +; CHECK-NEXT: add v6.4s, v6.4s, v18.4s ; CHECK-NEXT: fcmeq v17.4s, v2.4s, v2.4s -; CHECK-NEXT: orr v2.4s, #64, lsl #16 -; CHECK-NEXT: add v7.4s, v7.4s, v19.4s -; CHECK-NEXT: fcmeq v19.4s, v6.4s, v6.4s -; CHECK-NEXT: add v5.4s, v5.4s, v18.4s -; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s -; CHECK-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-NEXT: fcmeq v3.4s, v4.4s, v4.4s -; CHECK-NEXT: add 
v16.4s, v16.4s, v20.4s +; CHECK-NEXT: fcmeq v18.4s, v3.4s, v3.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: orr v6.4s, #64, lsl #16 -; CHECK-NEXT: orr v4.4s, #64, lsl #16 -; CHECK-NEXT: bit v2.16b, v5.16b, v17.16b -; CHECK-NEXT: mov v5.16b, v19.16b -; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b -; CHECK-NEXT: bif v1.16b, v4.16b, v3.16b -; CHECK-NEXT: bsl v5.16b, v16.16b, v6.16b -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h -; CHECK-NEXT: uzp2 v1.8h, v1.8h, v5.8h +; CHECK-NEXT: add v7.4s, v7.4s, v20.4s +; CHECK-NEXT: orr v2.4s, #64, lsl #16 +; CHECK-NEXT: add v1.4s, v1.4s, v4.4s +; CHECK-NEXT: fcmeq v4.4s, v19.4s, v19.4s +; CHECK-NEXT: orr v3.4s, #64, lsl #16 +; CHECK-NEXT: orr v19.4s, #64, lsl #16 +; CHECK-NEXT: bit v0.16b, v5.16b, v16.16b +; CHECK-NEXT: bit v2.16b, v6.16b, v17.16b +; CHECK-NEXT: bit v3.16b, v7.16b, v18.16b +; CHECK-NEXT: bif v1.16b, v19.16b, v4.16b +; CHECK-NEXT: uzp2 v0.8h, v2.8h, v0.8h +; CHECK-NEXT: uzp2 v1.8h, v1.8h, v3.8h ; CHECK-NEXT: ret entry: %c = sitofp <16 x i64> %a to <16 x bfloat> @@ -574,55 +655,82 @@ entry: define <16 x bfloat> @utofp_v16i64_v16bf16(<16 x i64> %a) { ; CHECK-LABEL: utofp_v16i64_v16bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: ucvtf v2.2d, v2.2d -; CHECK-NEXT: ucvtf v6.2d, v6.2d -; CHECK-NEXT: ucvtf v4.2d, v4.2d -; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: ucvtf v3.2d, v3.2d -; CHECK-NEXT: ucvtf v7.2d, v7.2d -; CHECK-NEXT: ucvtf v5.2d, v5.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn v2.2s, v2.2d -; CHECK-NEXT: fcvtn v6.2s, v6.2d -; CHECK-NEXT: fcvtn v4.2s, v4.2d -; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: fcvtn2 v2.4s, v3.2d -; CHECK-NEXT: fcvtn2 v6.4s, v7.2d -; CHECK-NEXT: fcvtn2 v4.4s, v5.2d +; CHECK-NEXT: mov x8, v2.d[1] +; CHECK-NEXT: fmov x11, d0 +; CHECK-NEXT: mov x9, v0.d[1] +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: mov x12, v6.d[1] +; CHECK-NEXT: ucvtf s2, x11 +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: ucvtf s16, x8 +; CHECK-NEXT: fmov x8, d6 +; 
CHECK-NEXT: ucvtf s0, x10 +; CHECK-NEXT: mov x10, v4.d[1] +; CHECK-NEXT: ucvtf s17, x9 +; CHECK-NEXT: mov x9, v3.d[1] +; CHECK-NEXT: ucvtf s6, x12 +; CHECK-NEXT: fmov x12, d4 +; CHECK-NEXT: ucvtf s4, x11 +; CHECK-NEXT: ucvtf s3, x8 +; CHECK-NEXT: fmov x11, d7 +; CHECK-NEXT: mov x8, v1.d[1] +; CHECK-NEXT: mov v0.s[1], v16.s[0] +; CHECK-NEXT: ucvtf s18, x10 +; CHECK-NEXT: ucvtf s19, x12 +; CHECK-NEXT: fmov x10, d1 +; CHECK-NEXT: mov v2.s[1], v17.s[0] +; CHECK-NEXT: mov x12, v5.d[1] +; CHECK-NEXT: mov v3.s[1], v6.s[0] +; CHECK-NEXT: ucvtf s6, x11 +; CHECK-NEXT: fmov x11, d5 +; CHECK-NEXT: ucvtf s1, x10 +; CHECK-NEXT: mov x10, v7.d[1] +; CHECK-NEXT: ucvtf s7, x9 +; CHECK-NEXT: mov v19.s[1], v18.s[0] +; CHECK-NEXT: ucvtf s16, x8 +; CHECK-NEXT: mov v0.s[2], v4.s[0] +; CHECK-NEXT: ucvtf s5, x11 +; CHECK-NEXT: mov v3.s[2], v6.s[0] +; CHECK-NEXT: ucvtf s4, x10 +; CHECK-NEXT: mov v2.s[2], v1.s[0] +; CHECK-NEXT: ucvtf s1, x12 +; CHECK-NEXT: mov v0.s[3], v7.s[0] +; CHECK-NEXT: mov v19.s[2], v5.s[0] +; CHECK-NEXT: mov v2.s[3], v16.s[0] +; CHECK-NEXT: mov v3.s[3], v4.s[0] +; CHECK-NEXT: movi v4.4s, #127, msl #8 +; CHECK-NEXT: ushr v5.4s, v0.4s, #16 +; CHECK-NEXT: mov v19.s[3], v1.s[0] ; CHECK-NEXT: movi v1.4s, #1 -; CHECK-NEXT: movi v3.4s, #127, msl #8 -; CHECK-NEXT: ushr v7.4s, v0.4s, #16 -; CHECK-NEXT: ushr v5.4s, v2.4s, #16 -; CHECK-NEXT: ushr v16.4s, v6.4s, #16 -; CHECK-NEXT: ushr v17.4s, v4.4s, #16 -; CHECK-NEXT: add v19.4s, v0.4s, v3.4s -; CHECK-NEXT: add v18.4s, v2.4s, v3.4s -; CHECK-NEXT: add v20.4s, v6.4s, v3.4s -; CHECK-NEXT: add v3.4s, v4.4s, v3.4s -; CHECK-NEXT: and v7.16b, v7.16b, v1.16b +; CHECK-NEXT: ushr v6.4s, v2.4s, #16 +; CHECK-NEXT: ushr v7.4s, v3.4s, #16 +; CHECK-NEXT: add v17.4s, v0.4s, v4.4s +; CHECK-NEXT: add v18.4s, v2.4s, v4.4s +; CHECK-NEXT: add v20.4s, v3.4s, v4.4s +; CHECK-NEXT: ushr v16.4s, v19.4s, #16 ; CHECK-NEXT: and v5.16b, v5.16b, v1.16b -; CHECK-NEXT: and v16.16b, v16.16b, v1.16b -; CHECK-NEXT: and v1.16b, v17.16b, v1.16b +; CHECK-NEXT: add 
v4.4s, v19.4s, v4.4s +; CHECK-NEXT: and v6.16b, v6.16b, v1.16b +; CHECK-NEXT: and v7.16b, v7.16b, v1.16b +; CHECK-NEXT: and v1.16b, v16.16b, v1.16b +; CHECK-NEXT: add v5.4s, v5.4s, v17.4s +; CHECK-NEXT: fcmeq v16.4s, v0.4s, v0.4s +; CHECK-NEXT: add v6.4s, v6.4s, v18.4s ; CHECK-NEXT: fcmeq v17.4s, v2.4s, v2.4s -; CHECK-NEXT: orr v2.4s, #64, lsl #16 -; CHECK-NEXT: add v7.4s, v7.4s, v19.4s -; CHECK-NEXT: fcmeq v19.4s, v6.4s, v6.4s -; CHECK-NEXT: add v5.4s, v5.4s, v18.4s -; CHECK-NEXT: fcmeq v18.4s, v0.4s, v0.4s -; CHECK-NEXT: add v1.4s, v1.4s, v3.4s -; CHECK-NEXT: fcmeq v3.4s, v4.4s, v4.4s -; CHECK-NEXT: add v16.4s, v16.4s, v20.4s +; CHECK-NEXT: fcmeq v18.4s, v3.4s, v3.4s ; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: orr v6.4s, #64, lsl #16 -; CHECK-NEXT: orr v4.4s, #64, lsl #16 -; CHECK-NEXT: bit v2.16b, v5.16b, v17.16b -; CHECK-NEXT: mov v5.16b, v19.16b -; CHECK-NEXT: bit v0.16b, v7.16b, v18.16b -; CHECK-NEXT: bif v1.16b, v4.16b, v3.16b -; CHECK-NEXT: bsl v5.16b, v16.16b, v6.16b -; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h -; CHECK-NEXT: uzp2 v1.8h, v1.8h, v5.8h +; CHECK-NEXT: add v7.4s, v7.4s, v20.4s +; CHECK-NEXT: orr v2.4s, #64, lsl #16 +; CHECK-NEXT: add v1.4s, v1.4s, v4.4s +; CHECK-NEXT: fcmeq v4.4s, v19.4s, v19.4s +; CHECK-NEXT: orr v3.4s, #64, lsl #16 +; CHECK-NEXT: orr v19.4s, #64, lsl #16 +; CHECK-NEXT: bit v0.16b, v5.16b, v16.16b +; CHECK-NEXT: bit v2.16b, v6.16b, v17.16b +; CHECK-NEXT: bit v3.16b, v7.16b, v18.16b +; CHECK-NEXT: bif v1.16b, v19.16b, v4.16b +; CHECK-NEXT: uzp2 v0.8h, v2.8h, v0.8h +; CHECK-NEXT: uzp2 v1.8h, v1.8h, v3.8h ; CHECK-NEXT: ret entry: %c = uitofp <16 x i64> %a to <16 x bfloat> @@ -632,107 +740,162 @@ entry: define <32 x bfloat> @stofp_v32i64_v32bf16(<32 x i64> %a) { ; CHECK-LABEL: stofp_v32i64_v32bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf v17.2d, v2.2d -; CHECK-NEXT: scvtf v18.2d, v0.2d -; CHECK-NEXT: scvtf v19.2d, v3.2d -; CHECK-NEXT: scvtf v3.2d, v6.2d -; CHECK-NEXT: ldp q21, q20, [sp, #32] -; CHECK-NEXT: scvtf 
v4.2d, v4.2d -; CHECK-NEXT: scvtf v6.2d, v7.2d -; CHECK-NEXT: scvtf v5.2d, v5.2d -; CHECK-NEXT: ldp q24, q23, [sp, #64] -; CHECK-NEXT: movi v16.4s, #1 -; CHECK-NEXT: fcvtn v0.2s, v17.2d -; CHECK-NEXT: scvtf v17.2d, v1.2d -; CHECK-NEXT: fcvtn v1.2s, v18.2d -; CHECK-NEXT: fcvtn v3.2s, v3.2d -; CHECK-NEXT: ldp q18, q7, [sp] -; CHECK-NEXT: scvtf v21.2d, v21.2d -; CHECK-NEXT: fcvtn v4.2s, v4.2d -; CHECK-NEXT: movi v2.4s, #127, msl #8 -; CHECK-NEXT: scvtf v20.2d, v20.2d -; CHECK-NEXT: fcvtn2 v0.4s, v19.2d -; CHECK-NEXT: ldp q22, q19, [sp, #96] -; CHECK-NEXT: fcvtn2 v1.4s, v17.2d -; CHECK-NEXT: fcvtn2 v3.4s, v6.2d -; CHECK-NEXT: scvtf v18.2d, v18.2d -; CHECK-NEXT: scvtf v17.2d, v24.2d -; CHECK-NEXT: fcvtn v6.2s, v21.2d -; CHECK-NEXT: fcvtn2 v4.4s, v5.2d -; CHECK-NEXT: scvtf v22.2d, v22.2d -; CHECK-NEXT: scvtf v21.2d, v23.2d -; CHECK-NEXT: scvtf v7.2d, v7.2d -; CHECK-NEXT: ushr v24.4s, v0.4s, #16 -; CHECK-NEXT: add v5.4s, v0.4s, v2.4s -; CHECK-NEXT: scvtf v19.2d, v19.2d -; CHECK-NEXT: ushr v23.4s, v1.4s, #16 -; CHECK-NEXT: ushr v25.4s, v3.4s, #16 -; CHECK-NEXT: fcvtn v18.2s, v18.2d -; CHECK-NEXT: fcvtn2 v6.4s, v20.2d -; CHECK-NEXT: add v26.4s, v1.4s, v2.4s -; CHECK-NEXT: fcvtn v17.2s, v17.2d -; CHECK-NEXT: and v24.16b, v24.16b, v16.16b -; CHECK-NEXT: fcvtn v22.2s, v22.2d -; CHECK-NEXT: fcmeq v20.4s, v0.4s, v0.4s -; CHECK-NEXT: and v23.16b, v23.16b, v16.16b -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: fcmeq v27.4s, v3.4s, v3.4s -; CHECK-NEXT: fcvtn2 v18.4s, v7.2d -; CHECK-NEXT: add v7.4s, v3.4s, v2.4s -; CHECK-NEXT: orr v3.4s, #64, lsl #16 -; CHECK-NEXT: add v5.4s, v24.4s, v5.4s -; CHECK-NEXT: and v24.16b, v25.16b, v16.16b -; CHECK-NEXT: ushr v25.4s, v4.4s, #16 -; CHECK-NEXT: fcvtn2 v22.4s, v19.2d -; CHECK-NEXT: add v19.4s, v23.4s, v26.4s -; CHECK-NEXT: ushr v26.4s, v6.4s, #16 -; CHECK-NEXT: fcvtn2 v17.4s, v21.2d -; CHECK-NEXT: fcmeq v21.4s, v1.4s, v1.4s +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: mov x9, v3.d[1] +; CHECK-NEXT: mov x8, v2.d[1] +; CHECK-NEXT: fmov 
x11, d3 +; CHECK-NEXT: fmov x12, d0 +; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: scvtf s2, x10 +; CHECK-NEXT: mov x10, v0.d[1] +; CHECK-NEXT: scvtf s19, x9 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: scvtf s16, x11 +; CHECK-NEXT: mov x11, v6.d[1] +; CHECK-NEXT: scvtf s0, x12 +; CHECK-NEXT: scvtf s18, x8 +; CHECK-NEXT: mov x8, v1.d[1] +; CHECK-NEXT: scvtf s20, x10 +; CHECK-NEXT: scvtf s17, x9 +; CHECK-NEXT: mov x9, v7.d[1] +; CHECK-NEXT: mov x10, v4.d[1] +; CHECK-NEXT: scvtf s21, x11 +; CHECK-NEXT: fmov x11, d6 +; CHECK-NEXT: mov v2.s[1], v18.s[0] +; CHECK-NEXT: scvtf s25, x8 +; CHECK-NEXT: movi v6.4s, #127, msl #8 +; CHECK-NEXT: mov v0.s[1], v20.s[0] +; CHECK-NEXT: ldp q24, q20, [sp, #32] +; CHECK-NEXT: scvtf s22, x9 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: scvtf s1, x11 +; CHECK-NEXT: scvtf s26, x10 +; CHECK-NEXT: fmov x11, d7 +; CHECK-NEXT: mov v2.s[2], v16.s[0] +; CHECK-NEXT: ldp q18, q16, [sp] +; CHECK-NEXT: mov x8, v24.d[1] +; CHECK-NEXT: scvtf s4, x9 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: mov v0.s[2], v17.s[0] +; CHECK-NEXT: mov v1.s[1], v21.s[0] +; CHECK-NEXT: scvtf s23, x11 +; CHECK-NEXT: mov x11, v5.d[1] +; CHECK-NEXT: mov v2.s[3], v19.s[0] +; CHECK-NEXT: scvtf s21, x8 +; CHECK-NEXT: mov x8, v20.d[1] +; CHECK-NEXT: scvtf s17, x9 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: mov v4.s[1], v26.s[0] +; CHECK-NEXT: mov v0.s[3], v25.s[0] +; CHECK-NEXT: ldp q26, q24, [sp, #96] +; CHECK-NEXT: mov v1.s[2], v23.s[0] +; CHECK-NEXT: ldp q25, q23, [sp, #64] +; CHECK-NEXT: scvtf s7, x11 +; CHECK-NEXT: scvtf s27, x8 +; CHECK-NEXT: fmov x8, d18 +; CHECK-NEXT: scvtf s5, x9 +; CHECK-NEXT: mov x10, v26.d[1] +; CHECK-NEXT: mov x9, v18.d[1] +; CHECK-NEXT: fmov x11, d20 +; CHECK-NEXT: mov v4.s[2], v17.s[0] +; CHECK-NEXT: mov v1.s[3], v22.s[0] +; CHECK-NEXT: ushr v19.4s, v2.4s, #16 +; CHECK-NEXT: scvtf s17, x8 +; CHECK-NEXT: fmov x8, d26 +; CHECK-NEXT: add v26.4s, v2.4s, v6.4s +; CHECK-NEXT: scvtf s22, x11 +; CHECK-NEXT: mov x11, v25.d[1] +; CHECK-NEXT: mov v5.s[1], v21.s[0] +; 
CHECK-NEXT: scvtf s28, x10 +; CHECK-NEXT: fmov x10, d16 +; CHECK-NEXT: scvtf s21, x9 +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: scvtf s18, x8 +; CHECK-NEXT: mov x8, v16.d[1] +; CHECK-NEXT: mov v4.s[3], v7.s[0] +; CHECK-NEXT: and v19.16b, v19.16b, v3.16b +; CHECK-NEXT: scvtf s16, x10 +; CHECK-NEXT: fmov x10, d24 +; CHECK-NEXT: scvtf s25, x11 +; CHECK-NEXT: scvtf s20, x9 +; CHECK-NEXT: mov x9, v24.d[1] +; CHECK-NEXT: mov v17.s[1], v21.s[0] +; CHECK-NEXT: fmov x11, d23 +; CHECK-NEXT: mov v18.s[1], v28.s[0] +; CHECK-NEXT: scvtf s24, x8 +; CHECK-NEXT: scvtf s21, x10 +; CHECK-NEXT: mov x10, v23.d[1] +; CHECK-NEXT: mov v5.s[2], v22.s[0] +; CHECK-NEXT: ushr v22.4s, v1.4s, #16 +; CHECK-NEXT: ushr v28.4s, v0.4s, #16 +; CHECK-NEXT: scvtf s23, x11 +; CHECK-NEXT: mov v20.s[1], v25.s[0] +; CHECK-NEXT: scvtf s25, x9 +; CHECK-NEXT: mov v17.s[2], v16.s[0] +; CHECK-NEXT: add v16.4s, v19.4s, v26.4s +; CHECK-NEXT: ushr v26.4s, v4.4s, #16 +; CHECK-NEXT: mov v18.s[2], v21.s[0] +; CHECK-NEXT: scvtf s7, x10 +; CHECK-NEXT: and v22.16b, v22.16b, v3.16b +; CHECK-NEXT: mov v5.s[3], v27.s[0] +; CHECK-NEXT: and v21.16b, v28.16b, v3.16b +; CHECK-NEXT: fcmeq v19.4s, v2.4s, v2.4s +; CHECK-NEXT: mov v20.s[2], v23.s[0] +; CHECK-NEXT: add v23.4s, v0.4s, v6.4s +; CHECK-NEXT: orr v2.4s, #64, lsl #16 +; CHECK-NEXT: mov v17.s[3], v24.s[0] +; CHECK-NEXT: add v24.4s, v1.4s, v6.4s +; CHECK-NEXT: fcmeq v27.4s, v1.4s, v1.4s +; CHECK-NEXT: mov v18.s[3], v25.s[0] +; CHECK-NEXT: add v25.4s, v4.4s, v6.4s ; CHECK-NEXT: orr v1.4s, #64, lsl #16 -; CHECK-NEXT: and v23.16b, v25.16b, v16.16b -; CHECK-NEXT: add v25.4s, v4.4s, v2.4s -; CHECK-NEXT: add v7.4s, v24.4s, v7.4s -; CHECK-NEXT: ushr v24.4s, v18.4s, #16 -; CHECK-NEXT: add v30.4s, v18.4s, v2.4s -; CHECK-NEXT: bit v0.16b, v5.16b, v20.16b -; CHECK-NEXT: ushr v28.4s, v22.4s, #16 -; CHECK-NEXT: add v31.4s, v22.4s, v2.4s +; CHECK-NEXT: bit v2.16b, v16.16b, v19.16b +; CHECK-NEXT: mov v20.s[3], v7.s[0] +; CHECK-NEXT: add v22.4s, v22.4s, v24.4s +; CHECK-NEXT: add v7.4s, 
v21.4s, v23.4s +; CHECK-NEXT: ushr v24.4s, v17.4s, #16 +; CHECK-NEXT: and v23.16b, v26.16b, v3.16b +; CHECK-NEXT: ushr v26.4s, v5.4s, #16 +; CHECK-NEXT: ushr v28.4s, v18.4s, #16 +; CHECK-NEXT: add v30.4s, v17.4s, v6.4s +; CHECK-NEXT: add v31.4s, v18.4s, v6.4s +; CHECK-NEXT: fcmeq v21.4s, v0.4s, v0.4s +; CHECK-NEXT: orr v0.4s, #64, lsl #16 +; CHECK-NEXT: bit v1.16b, v22.16b, v27.16b +; CHECK-NEXT: ushr v29.4s, v20.4s, #16 +; CHECK-NEXT: and v24.16b, v24.16b, v3.16b ; CHECK-NEXT: add v23.4s, v23.4s, v25.4s -; CHECK-NEXT: and v25.16b, v26.16b, v16.16b -; CHECK-NEXT: add v26.4s, v6.4s, v2.4s -; CHECK-NEXT: ushr v29.4s, v17.4s, #16 -; CHECK-NEXT: and v24.16b, v24.16b, v16.16b -; CHECK-NEXT: add v2.4s, v17.4s, v2.4s -; CHECK-NEXT: and v28.16b, v28.16b, v16.16b -; CHECK-NEXT: bit v3.16b, v7.16b, v27.16b -; CHECK-NEXT: bit v1.16b, v19.16b, v21.16b -; CHECK-NEXT: add v25.4s, v25.4s, v26.4s -; CHECK-NEXT: fcmeq v26.4s, v6.4s, v6.4s -; CHECK-NEXT: orr v6.4s, #64, lsl #16 -; CHECK-NEXT: and v16.16b, v29.16b, v16.16b +; CHECK-NEXT: and v28.16b, v28.16b, v3.16b +; CHECK-NEXT: and v25.16b, v26.16b, v3.16b +; CHECK-NEXT: add v26.4s, v5.4s, v6.4s +; CHECK-NEXT: add v6.4s, v20.4s, v6.4s +; CHECK-NEXT: and v3.16b, v29.16b, v3.16b ; CHECK-NEXT: add v24.4s, v24.4s, v30.4s -; CHECK-NEXT: fcmeq v30.4s, v18.4s, v18.4s +; CHECK-NEXT: fcmeq v30.4s, v17.4s, v17.4s ; CHECK-NEXT: add v28.4s, v28.4s, v31.4s -; CHECK-NEXT: fcmeq v31.4s, v22.4s, v22.4s +; CHECK-NEXT: fcmeq v31.4s, v18.4s, v18.4s ; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s +; CHECK-NEXT: add v25.4s, v25.4s, v26.4s +; CHECK-NEXT: fcmeq v26.4s, v5.4s, v5.4s ; CHECK-NEXT: orr v4.4s, #64, lsl #16 -; CHECK-NEXT: orr v18.4s, #64, lsl #16 -; CHECK-NEXT: orr v22.4s, #64, lsl #16 -; CHECK-NEXT: mov v5.16b, v26.16b -; CHECK-NEXT: add v2.4s, v16.4s, v2.4s -; CHECK-NEXT: fcmeq v16.4s, v17.4s, v17.4s +; CHECK-NEXT: add v3.4s, v3.4s, v6.4s +; CHECK-NEXT: fcmeq v6.4s, v20.4s, v20.4s +; CHECK-NEXT: orr v5.4s, #64, lsl #16 ; CHECK-NEXT: orr v17.4s, 
#64, lsl #16 -; CHECK-NEXT: uzp2 v0.8h, v1.8h, v0.8h -; CHECK-NEXT: mov v7.16b, v31.16b +; CHECK-NEXT: orr v18.4s, #64, lsl #16 +; CHECK-NEXT: orr v20.4s, #64, lsl #16 +; CHECK-NEXT: bit v0.16b, v7.16b, v21.16b +; CHECK-NEXT: mov v7.16b, v30.16b +; CHECK-NEXT: mov v16.16b, v31.16b ; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b -; CHECK-NEXT: bsl v5.16b, v25.16b, v6.16b -; CHECK-NEXT: mov v6.16b, v30.16b -; CHECK-NEXT: bsl v16.16b, v2.16b, v17.16b -; CHECK-NEXT: bsl v7.16b, v28.16b, v22.16b -; CHECK-NEXT: bsl v6.16b, v24.16b, v18.16b -; CHECK-NEXT: uzp2 v1.8h, v4.8h, v3.8h -; CHECK-NEXT: uzp2 v3.8h, v16.8h, v7.8h -; CHECK-NEXT: uzp2 v2.8h, v6.8h, v5.8h +; CHECK-NEXT: bit v5.16b, v25.16b, v26.16b +; CHECK-NEXT: bif v3.16b, v20.16b, v6.16b +; CHECK-NEXT: bsl v7.16b, v24.16b, v17.16b +; CHECK-NEXT: bsl v16.16b, v28.16b, v18.16b +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; CHECK-NEXT: uzp2 v1.8h, v4.8h, v1.8h +; CHECK-NEXT: uzp2 v2.8h, v7.8h, v5.8h +; CHECK-NEXT: uzp2 v3.8h, v3.8h, v16.8h ; CHECK-NEXT: ret entry: %c = sitofp <32 x i64> %a to <32 x bfloat> @@ -742,107 +905,162 @@ entry: define <32 x bfloat> @utofp_v32i64_v32bf16(<32 x i64> %a) { ; CHECK-LABEL: utofp_v32i64_v32bf16: ; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ucvtf v17.2d, v2.2d -; CHECK-NEXT: ucvtf v18.2d, v0.2d -; CHECK-NEXT: ucvtf v19.2d, v3.2d -; CHECK-NEXT: ucvtf v3.2d, v6.2d -; CHECK-NEXT: ldp q21, q20, [sp, #32] -; CHECK-NEXT: ucvtf v4.2d, v4.2d -; CHECK-NEXT: ucvtf v6.2d, v7.2d -; CHECK-NEXT: ucvtf v5.2d, v5.2d -; CHECK-NEXT: ldp q24, q23, [sp, #64] -; CHECK-NEXT: movi v16.4s, #1 -; CHECK-NEXT: fcvtn v0.2s, v17.2d -; CHECK-NEXT: ucvtf v17.2d, v1.2d -; CHECK-NEXT: fcvtn v1.2s, v18.2d -; CHECK-NEXT: fcvtn v3.2s, v3.2d -; CHECK-NEXT: ldp q18, q7, [sp] -; CHECK-NEXT: ucvtf v21.2d, v21.2d -; CHECK-NEXT: fcvtn v4.2s, v4.2d -; CHECK-NEXT: movi v2.4s, #127, msl #8 -; CHECK-NEXT: ucvtf v20.2d, v20.2d -; CHECK-NEXT: fcvtn2 v0.4s, v19.2d -; CHECK-NEXT: ldp q22, q19, [sp, #96] -; CHECK-NEXT: fcvtn2 v1.4s, v17.2d -; 
CHECK-NEXT: fcvtn2 v3.4s, v6.2d -; CHECK-NEXT: ucvtf v18.2d, v18.2d -; CHECK-NEXT: ucvtf v17.2d, v24.2d -; CHECK-NEXT: fcvtn v6.2s, v21.2d -; CHECK-NEXT: fcvtn2 v4.4s, v5.2d -; CHECK-NEXT: ucvtf v22.2d, v22.2d -; CHECK-NEXT: ucvtf v21.2d, v23.2d -; CHECK-NEXT: ucvtf v7.2d, v7.2d -; CHECK-NEXT: ushr v24.4s, v0.4s, #16 -; CHECK-NEXT: add v5.4s, v0.4s, v2.4s -; CHECK-NEXT: ucvtf v19.2d, v19.2d -; CHECK-NEXT: ushr v23.4s, v1.4s, #16 -; CHECK-NEXT: ushr v25.4s, v3.4s, #16 -; CHECK-NEXT: fcvtn v18.2s, v18.2d -; CHECK-NEXT: fcvtn2 v6.4s, v20.2d -; CHECK-NEXT: add v26.4s, v1.4s, v2.4s -; CHECK-NEXT: fcvtn v17.2s, v17.2d -; CHECK-NEXT: and v24.16b, v24.16b, v16.16b -; CHECK-NEXT: fcvtn v22.2s, v22.2d -; CHECK-NEXT: fcmeq v20.4s, v0.4s, v0.4s -; CHECK-NEXT: and v23.16b, v23.16b, v16.16b -; CHECK-NEXT: orr v0.4s, #64, lsl #16 -; CHECK-NEXT: fcmeq v27.4s, v3.4s, v3.4s -; CHECK-NEXT: fcvtn2 v18.4s, v7.2d -; CHECK-NEXT: add v7.4s, v3.4s, v2.4s -; CHECK-NEXT: orr v3.4s, #64, lsl #16 -; CHECK-NEXT: add v5.4s, v24.4s, v5.4s -; CHECK-NEXT: and v24.16b, v25.16b, v16.16b -; CHECK-NEXT: ushr v25.4s, v4.4s, #16 -; CHECK-NEXT: fcvtn2 v22.4s, v19.2d -; CHECK-NEXT: add v19.4s, v23.4s, v26.4s -; CHECK-NEXT: ushr v26.4s, v6.4s, #16 -; CHECK-NEXT: fcvtn2 v17.4s, v21.2d -; CHECK-NEXT: fcmeq v21.4s, v1.4s, v1.4s +; CHECK-NEXT: fmov x10, d2 +; CHECK-NEXT: mov x9, v3.d[1] +; CHECK-NEXT: mov x8, v2.d[1] +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: fmov x12, d0 +; CHECK-NEXT: movi v3.4s, #1 +; CHECK-NEXT: ucvtf s2, x10 +; CHECK-NEXT: mov x10, v0.d[1] +; CHECK-NEXT: ucvtf s19, x9 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: ucvtf s16, x11 +; CHECK-NEXT: mov x11, v6.d[1] +; CHECK-NEXT: ucvtf s0, x12 +; CHECK-NEXT: ucvtf s18, x8 +; CHECK-NEXT: mov x8, v1.d[1] +; CHECK-NEXT: ucvtf s20, x10 +; CHECK-NEXT: ucvtf s17, x9 +; CHECK-NEXT: mov x9, v7.d[1] +; CHECK-NEXT: mov x10, v4.d[1] +; CHECK-NEXT: ucvtf s21, x11 +; CHECK-NEXT: fmov x11, d6 +; CHECK-NEXT: mov v2.s[1], v18.s[0] +; CHECK-NEXT: ucvtf s25, x8 +; 
CHECK-NEXT: movi v6.4s, #127, msl #8 +; CHECK-NEXT: mov v0.s[1], v20.s[0] +; CHECK-NEXT: ldp q24, q20, [sp, #32] +; CHECK-NEXT: ucvtf s22, x9 +; CHECK-NEXT: fmov x9, d4 +; CHECK-NEXT: ucvtf s1, x11 +; CHECK-NEXT: ucvtf s26, x10 +; CHECK-NEXT: fmov x11, d7 +; CHECK-NEXT: mov v2.s[2], v16.s[0] +; CHECK-NEXT: ldp q18, q16, [sp] +; CHECK-NEXT: mov x8, v24.d[1] +; CHECK-NEXT: ucvtf s4, x9 +; CHECK-NEXT: fmov x9, d5 +; CHECK-NEXT: mov v0.s[2], v17.s[0] +; CHECK-NEXT: mov v1.s[1], v21.s[0] +; CHECK-NEXT: ucvtf s23, x11 +; CHECK-NEXT: mov x11, v5.d[1] +; CHECK-NEXT: mov v2.s[3], v19.s[0] +; CHECK-NEXT: ucvtf s21, x8 +; CHECK-NEXT: mov x8, v20.d[1] +; CHECK-NEXT: ucvtf s17, x9 +; CHECK-NEXT: fmov x9, d24 +; CHECK-NEXT: mov v4.s[1], v26.s[0] +; CHECK-NEXT: mov v0.s[3], v25.s[0] +; CHECK-NEXT: ldp q26, q24, [sp, #96] +; CHECK-NEXT: mov v1.s[2], v23.s[0] +; CHECK-NEXT: ldp q25, q23, [sp, #64] +; CHECK-NEXT: ucvtf s7, x11 +; CHECK-NEXT: ucvtf s27, x8 +; CHECK-NEXT: fmov x8, d18 +; CHECK-NEXT: ucvtf s5, x9 +; CHECK-NEXT: mov x10, v26.d[1] +; CHECK-NEXT: mov x9, v18.d[1] +; CHECK-NEXT: fmov x11, d20 +; CHECK-NEXT: mov v4.s[2], v17.s[0] +; CHECK-NEXT: mov v1.s[3], v22.s[0] +; CHECK-NEXT: ushr v19.4s, v2.4s, #16 +; CHECK-NEXT: ucvtf s17, x8 +; CHECK-NEXT: fmov x8, d26 +; CHECK-NEXT: add v26.4s, v2.4s, v6.4s +; CHECK-NEXT: ucvtf s22, x11 +; CHECK-NEXT: mov x11, v25.d[1] +; CHECK-NEXT: mov v5.s[1], v21.s[0] +; CHECK-NEXT: ucvtf s28, x10 +; CHECK-NEXT: fmov x10, d16 +; CHECK-NEXT: ucvtf s21, x9 +; CHECK-NEXT: fmov x9, d25 +; CHECK-NEXT: ucvtf s18, x8 +; CHECK-NEXT: mov x8, v16.d[1] +; CHECK-NEXT: mov v4.s[3], v7.s[0] +; CHECK-NEXT: and v19.16b, v19.16b, v3.16b +; CHECK-NEXT: ucvtf s16, x10 +; CHECK-NEXT: fmov x10, d24 +; CHECK-NEXT: ucvtf s25, x11 +; CHECK-NEXT: ucvtf s20, x9 +; CHECK-NEXT: mov x9, v24.d[1] +; CHECK-NEXT: mov v17.s[1], v21.s[0] +; CHECK-NEXT: fmov x11, d23 +; CHECK-NEXT: mov v18.s[1], v28.s[0] +; CHECK-NEXT: ucvtf s24, x8 +; CHECK-NEXT: ucvtf s21, x10 +; CHECK-NEXT: 
mov x10, v23.d[1] +; CHECK-NEXT: mov v5.s[2], v22.s[0] +; CHECK-NEXT: ushr v22.4s, v1.4s, #16 +; CHECK-NEXT: ushr v28.4s, v0.4s, #16 +; CHECK-NEXT: ucvtf s23, x11 +; CHECK-NEXT: mov v20.s[1], v25.s[0] +; CHECK-NEXT: ucvtf s25, x9 +; CHECK-NEXT: mov v17.s[2], v16.s[0] +; CHECK-NEXT: add v16.4s, v19.4s, v26.4s +; CHECK-NEXT: ushr v26.4s, v4.4s, #16 +; CHECK-NEXT: mov v18.s[2], v21.s[0] +; CHECK-NEXT: ucvtf s7, x10 +; CHECK-NEXT: and v22.16b, v22.16b, v3.16b +; CHECK-NEXT: mov v5.s[3], v27.s[0] +; CHECK-NEXT: and v21.16b, v28.16b, v3.16b +; CHECK-NEXT: fcmeq v19.4s, v2.4s, v2.4s +; CHECK-NEXT: mov v20.s[2], v23.s[0] +; CHECK-NEXT: add v23.4s, v0.4s, v6.4s +; CHECK-NEXT: orr v2.4s, #64, lsl #16 +; CHECK-NEXT: mov v17.s[3], v24.s[0] +; CHECK-NEXT: add v24.4s, v1.4s, v6.4s +; CHECK-NEXT: fcmeq v27.4s, v1.4s, v1.4s +; CHECK-NEXT: mov v18.s[3], v25.s[0] +; CHECK-NEXT: add v25.4s, v4.4s, v6.4s ; CHECK-NEXT: orr v1.4s, #64, lsl #16 -; CHECK-NEXT: and v23.16b, v25.16b, v16.16b -; CHECK-NEXT: add v25.4s, v4.4s, v2.4s -; CHECK-NEXT: add v7.4s, v24.4s, v7.4s -; CHECK-NEXT: ushr v24.4s, v18.4s, #16 -; CHECK-NEXT: add v30.4s, v18.4s, v2.4s -; CHECK-NEXT: bit v0.16b, v5.16b, v20.16b -; CHECK-NEXT: ushr v28.4s, v22.4s, #16 -; CHECK-NEXT: add v31.4s, v22.4s, v2.4s +; CHECK-NEXT: bit v2.16b, v16.16b, v19.16b +; CHECK-NEXT: mov v20.s[3], v7.s[0] +; CHECK-NEXT: add v22.4s, v22.4s, v24.4s +; CHECK-NEXT: add v7.4s, v21.4s, v23.4s +; CHECK-NEXT: ushr v24.4s, v17.4s, #16 +; CHECK-NEXT: and v23.16b, v26.16b, v3.16b +; CHECK-NEXT: ushr v26.4s, v5.4s, #16 +; CHECK-NEXT: ushr v28.4s, v18.4s, #16 +; CHECK-NEXT: add v30.4s, v17.4s, v6.4s +; CHECK-NEXT: add v31.4s, v18.4s, v6.4s +; CHECK-NEXT: fcmeq v21.4s, v0.4s, v0.4s +; CHECK-NEXT: orr v0.4s, #64, lsl #16 +; CHECK-NEXT: bit v1.16b, v22.16b, v27.16b +; CHECK-NEXT: ushr v29.4s, v20.4s, #16 +; CHECK-NEXT: and v24.16b, v24.16b, v3.16b ; CHECK-NEXT: add v23.4s, v23.4s, v25.4s -; CHECK-NEXT: and v25.16b, v26.16b, v16.16b -; CHECK-NEXT: add v26.4s, 
v6.4s, v2.4s -; CHECK-NEXT: ushr v29.4s, v17.4s, #16 -; CHECK-NEXT: and v24.16b, v24.16b, v16.16b -; CHECK-NEXT: add v2.4s, v17.4s, v2.4s -; CHECK-NEXT: and v28.16b, v28.16b, v16.16b -; CHECK-NEXT: bit v3.16b, v7.16b, v27.16b -; CHECK-NEXT: bit v1.16b, v19.16b, v21.16b -; CHECK-NEXT: add v25.4s, v25.4s, v26.4s -; CHECK-NEXT: fcmeq v26.4s, v6.4s, v6.4s -; CHECK-NEXT: orr v6.4s, #64, lsl #16 -; CHECK-NEXT: and v16.16b, v29.16b, v16.16b +; CHECK-NEXT: and v28.16b, v28.16b, v3.16b +; CHECK-NEXT: and v25.16b, v26.16b, v3.16b +; CHECK-NEXT: add v26.4s, v5.4s, v6.4s +; CHECK-NEXT: add v6.4s, v20.4s, v6.4s +; CHECK-NEXT: and v3.16b, v29.16b, v3.16b ; CHECK-NEXT: add v24.4s, v24.4s, v30.4s -; CHECK-NEXT: fcmeq v30.4s, v18.4s, v18.4s +; CHECK-NEXT: fcmeq v30.4s, v17.4s, v17.4s ; CHECK-NEXT: add v28.4s, v28.4s, v31.4s -; CHECK-NEXT: fcmeq v31.4s, v22.4s, v22.4s +; CHECK-NEXT: fcmeq v31.4s, v18.4s, v18.4s ; CHECK-NEXT: fcmeq v29.4s, v4.4s, v4.4s +; CHECK-NEXT: add v25.4s, v25.4s, v26.4s +; CHECK-NEXT: fcmeq v26.4s, v5.4s, v5.4s ; CHECK-NEXT: orr v4.4s, #64, lsl #16 -; CHECK-NEXT: orr v18.4s, #64, lsl #16 -; CHECK-NEXT: orr v22.4s, #64, lsl #16 -; CHECK-NEXT: mov v5.16b, v26.16b -; CHECK-NEXT: add v2.4s, v16.4s, v2.4s -; CHECK-NEXT: fcmeq v16.4s, v17.4s, v17.4s +; CHECK-NEXT: add v3.4s, v3.4s, v6.4s +; CHECK-NEXT: fcmeq v6.4s, v20.4s, v20.4s +; CHECK-NEXT: orr v5.4s, #64, lsl #16 ; CHECK-NEXT: orr v17.4s, #64, lsl #16 -; CHECK-NEXT: uzp2 v0.8h, v1.8h, v0.8h -; CHECK-NEXT: mov v7.16b, v31.16b +; CHECK-NEXT: orr v18.4s, #64, lsl #16 +; CHECK-NEXT: orr v20.4s, #64, lsl #16 +; CHECK-NEXT: bit v0.16b, v7.16b, v21.16b +; CHECK-NEXT: mov v7.16b, v30.16b +; CHECK-NEXT: mov v16.16b, v31.16b ; CHECK-NEXT: bit v4.16b, v23.16b, v29.16b -; CHECK-NEXT: bsl v5.16b, v25.16b, v6.16b -; CHECK-NEXT: mov v6.16b, v30.16b -; CHECK-NEXT: bsl v16.16b, v2.16b, v17.16b -; CHECK-NEXT: bsl v7.16b, v28.16b, v22.16b -; CHECK-NEXT: bsl v6.16b, v24.16b, v18.16b -; CHECK-NEXT: uzp2 v1.8h, v4.8h, v3.8h -; 
CHECK-NEXT: uzp2 v3.8h, v16.8h, v7.8h -; CHECK-NEXT: uzp2 v2.8h, v6.8h, v5.8h +; CHECK-NEXT: bit v5.16b, v25.16b, v26.16b +; CHECK-NEXT: bif v3.16b, v20.16b, v6.16b +; CHECK-NEXT: bsl v7.16b, v24.16b, v17.16b +; CHECK-NEXT: bsl v16.16b, v28.16b, v18.16b +; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h +; CHECK-NEXT: uzp2 v1.8h, v4.8h, v1.8h +; CHECK-NEXT: uzp2 v2.8h, v7.8h, v5.8h +; CHECK-NEXT: uzp2 v3.8h, v3.8h, v16.8h ; CHECK-NEXT: ret entry: %c = uitofp <32 x i64> %a to <32 x bfloat> diff --git a/llvm/test/CodeGen/AArch64/itofp.ll b/llvm/test/CodeGen/AArch64/itofp.ll index 81c1a64f2d434..07957c117868d 100644 --- a/llvm/test/CodeGen/AArch64/itofp.ll +++ b/llvm/test/CodeGen/AArch64/itofp.ll @@ -4421,22 +4421,42 @@ entry: } define <2 x float> @stofp_v2i64_v2f32(<2 x i64> %a) { -; CHECK-LABEL: stofp_v2i64_v2f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_v2i64_v2f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov x8, v0.d[1] +; CHECK-SD-NEXT: fmov x9, d0 +; CHECK-SD-NEXT: scvtf s0, x9 +; CHECK-SD-NEXT: scvtf s1, x8 +; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_v2i64_v2f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: scvtf v0.2d, v0.2d +; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d +; CHECK-GI-NEXT: ret entry: %c = sitofp <2 x i64> %a to <2 x float> ret <2 x float> %c } define <2 x float> @utofp_v2i64_v2f32(<2 x i64> %a) { -; CHECK-LABEL: utofp_v2i64_v2f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_v2i64_v2f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov x8, v0.d[1] +; CHECK-SD-NEXT: fmov x9, d0 +; CHECK-SD-NEXT: ucvtf s0, x9 +; CHECK-SD-NEXT: ucvtf s1, x8 +; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0 +; 
CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_v2i64_v2f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d +; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d +; CHECK-GI-NEXT: ret entry: %c = uitofp <2 x i64> %a to <2 x float> ret <2 x float> %c @@ -4446,13 +4466,18 @@ define <3 x float> @stofp_v3i64_v3f32(<3 x i64> %a) { ; CHECK-SD-LABEL: stofp_v3i64_v3f32: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: scvtf v1.2d, v2.2d -; CHECK-SD-NEXT: scvtf v0.2d, v0.2d -; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d -; CHECK-SD-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: scvtf s3, x8 +; CHECK-SD-NEXT: fmov x8, d1 +; CHECK-SD-NEXT: scvtf s1, x8 +; CHECK-SD-NEXT: fmov x8, d2 +; CHECK-SD-NEXT: mov v0.s[0], v3.s[0] +; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] +; CHECK-SD-NEXT: scvtf s1, x8 +; CHECK-SD-NEXT: mov v0.s[2], v1.s[0] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: stofp_v3i64_v3f32: @@ -4478,13 +4503,18 @@ define <3 x float> @utofp_v3i64_v3f32(<3 x i64> %a) { ; CHECK-SD-LABEL: utofp_v3i64_v3f32: ; CHECK-SD: // %bb.0: // %entry ; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmov x8, d0 ; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-SD-NEXT: // kill: def $d2 killed $d2 def $q2 -; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] -; CHECK-SD-NEXT: ucvtf v1.2d, v2.2d -; CHECK-SD-NEXT: ucvtf v0.2d, v0.2d -; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d -; CHECK-SD-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-SD-NEXT: movi v0.2d, #0000000000000000 +; CHECK-SD-NEXT: ucvtf s3, x8 +; CHECK-SD-NEXT: fmov x8, d1 +; CHECK-SD-NEXT: ucvtf s1, x8 +; CHECK-SD-NEXT: fmov x8, d2 +; CHECK-SD-NEXT: mov v0.s[0], v3.s[0] +; CHECK-SD-NEXT: mov v0.s[1], v1.s[0] +; CHECK-SD-NEXT: ucvtf s1, x8 +; CHECK-SD-NEXT: mov v0.s[2], v1.s[0] 
; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: utofp_v3i64_v3f32: @@ -4507,26 +4537,56 @@ entry: } define <4 x float> @stofp_v4i64_v4f32(<4 x i64> %a) { -; CHECK-LABEL: stofp_v4i64_v4f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: scvtf v1.2d, v1.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_v4i64_v4f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov x8, v0.d[1] +; CHECK-SD-NEXT: fmov x9, d0 +; CHECK-SD-NEXT: scvtf s0, x9 +; CHECK-SD-NEXT: mov x9, v1.d[1] +; CHECK-SD-NEXT: scvtf s2, x8 +; CHECK-SD-NEXT: fmov x8, d1 +; CHECK-SD-NEXT: scvtf s1, x8 +; CHECK-SD-NEXT: mov v0.s[1], v2.s[0] +; CHECK-SD-NEXT: mov v0.s[2], v1.s[0] +; CHECK-SD-NEXT: scvtf s1, x9 +; CHECK-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_v4i64_v4f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: scvtf v0.2d, v0.2d +; CHECK-GI-NEXT: scvtf v1.2d, v1.2d +; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d +; CHECK-GI-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-GI-NEXT: ret entry: %c = sitofp <4 x i64> %a to <4 x float> ret <4 x float> %c } define <4 x float> @utofp_v4i64_v4f32(<4 x i64> %a) { -; CHECK-LABEL: utofp_v4i64_v4f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: ucvtf v1.2d, v1.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn2 v0.4s, v1.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_v4i64_v4f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov x8, v0.d[1] +; CHECK-SD-NEXT: fmov x9, d0 +; CHECK-SD-NEXT: ucvtf s0, x9 +; CHECK-SD-NEXT: mov x9, v1.d[1] +; CHECK-SD-NEXT: ucvtf s2, x8 +; CHECK-SD-NEXT: fmov x8, d1 +; CHECK-SD-NEXT: ucvtf s1, x8 +; CHECK-SD-NEXT: mov v0.s[1], v2.s[0] +; CHECK-SD-NEXT: mov v0.s[2], v1.s[0] +; CHECK-SD-NEXT: ucvtf s1, x9 +; CHECK-SD-NEXT: mov v0.s[3], v1.s[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_v4i64_v4f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d +; 
CHECK-GI-NEXT: ucvtf v1.2d, v1.2d +; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d +; CHECK-GI-NEXT: fcvtn2 v0.4s, v1.2d +; CHECK-GI-NEXT: ret entry: %c = uitofp <4 x i64> %a to <4 x float> ret <4 x float> %c @@ -4535,14 +4595,29 @@ entry: define <8 x float> @stofp_v8i64_v8f32(<8 x i64> %a) { ; CHECK-SD-LABEL: stofp_v8i64_v8f32: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: scvtf v0.2d, v0.2d -; CHECK-SD-NEXT: scvtf v2.2d, v2.2d -; CHECK-SD-NEXT: scvtf v4.2d, v1.2d -; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d -; CHECK-SD-NEXT: fcvtn v1.2s, v2.2d -; CHECK-SD-NEXT: scvtf v2.2d, v3.2d -; CHECK-SD-NEXT: fcvtn2 v0.4s, v4.2d -; CHECK-SD-NEXT: fcvtn2 v1.4s, v2.2d +; CHECK-SD-NEXT: mov x8, v0.d[1] +; CHECK-SD-NEXT: mov x9, v2.d[1] +; CHECK-SD-NEXT: fmov x10, d0 +; CHECK-SD-NEXT: fmov x11, d2 +; CHECK-SD-NEXT: scvtf s0, x10 +; CHECK-SD-NEXT: mov x10, v3.d[1] +; CHECK-SD-NEXT: scvtf s4, x8 +; CHECK-SD-NEXT: scvtf s5, x9 +; CHECK-SD-NEXT: scvtf s2, x11 +; CHECK-SD-NEXT: fmov x9, d1 +; CHECK-SD-NEXT: fmov x11, d3 +; CHECK-SD-NEXT: mov x8, v1.d[1] +; CHECK-SD-NEXT: scvtf s1, x9 +; CHECK-SD-NEXT: mov v0.s[1], v4.s[0] +; CHECK-SD-NEXT: scvtf s3, x11 +; CHECK-SD-NEXT: mov v2.s[1], v5.s[0] +; CHECK-SD-NEXT: scvtf s4, x8 +; CHECK-SD-NEXT: mov v0.s[2], v1.s[0] +; CHECK-SD-NEXT: scvtf s1, x10 +; CHECK-SD-NEXT: mov v2.s[2], v3.s[0] +; CHECK-SD-NEXT: mov v0.s[3], v4.s[0] +; CHECK-SD-NEXT: mov v2.s[3], v1.s[0] +; CHECK-SD-NEXT: mov v1.16b, v2.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: stofp_v8i64_v8f32: @@ -4564,14 +4639,29 @@ entry: define <8 x float> @utofp_v8i64_v8f32(<8 x i64> %a) { ; CHECK-SD-LABEL: utofp_v8i64_v8f32: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ucvtf v0.2d, v0.2d -; CHECK-SD-NEXT: ucvtf v2.2d, v2.2d -; CHECK-SD-NEXT: ucvtf v4.2d, v1.2d -; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d -; CHECK-SD-NEXT: fcvtn v1.2s, v2.2d -; CHECK-SD-NEXT: ucvtf v2.2d, v3.2d -; CHECK-SD-NEXT: fcvtn2 v0.4s, v4.2d -; CHECK-SD-NEXT: fcvtn2 v1.4s, v2.2d +; CHECK-SD-NEXT: mov x8, v0.d[1] +; CHECK-SD-NEXT: mov 
x9, v2.d[1] +; CHECK-SD-NEXT: fmov x10, d0 +; CHECK-SD-NEXT: fmov x11, d2 +; CHECK-SD-NEXT: ucvtf s0, x10 +; CHECK-SD-NEXT: mov x10, v3.d[1] +; CHECK-SD-NEXT: ucvtf s4, x8 +; CHECK-SD-NEXT: ucvtf s5, x9 +; CHECK-SD-NEXT: ucvtf s2, x11 +; CHECK-SD-NEXT: fmov x9, d1 +; CHECK-SD-NEXT: fmov x11, d3 +; CHECK-SD-NEXT: mov x8, v1.d[1] +; CHECK-SD-NEXT: ucvtf s1, x9 +; CHECK-SD-NEXT: mov v0.s[1], v4.s[0] +; CHECK-SD-NEXT: ucvtf s3, x11 +; CHECK-SD-NEXT: mov v2.s[1], v5.s[0] +; CHECK-SD-NEXT: ucvtf s4, x8 +; CHECK-SD-NEXT: mov v0.s[2], v1.s[0] +; CHECK-SD-NEXT: ucvtf s1, x10 +; CHECK-SD-NEXT: mov v2.s[2], v3.s[0] +; CHECK-SD-NEXT: mov v0.s[3], v4.s[0] +; CHECK-SD-NEXT: mov v2.s[3], v1.s[0] +; CHECK-SD-NEXT: mov v1.16b, v2.16b ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: utofp_v8i64_v8f32: @@ -4591,50 +4681,148 @@ entry: } define <16 x float> @stofp_v16i64_v16f32(<16 x i64> %a) { -; CHECK-LABEL: stofp_v16i64_v16f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: scvtf v2.2d, v2.2d -; CHECK-NEXT: scvtf v4.2d, v4.2d -; CHECK-NEXT: scvtf v6.2d, v6.2d -; CHECK-NEXT: scvtf v16.2d, v1.2d -; CHECK-NEXT: scvtf v17.2d, v3.2d -; CHECK-NEXT: scvtf v5.2d, v5.2d -; CHECK-NEXT: scvtf v7.2d, v7.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn v1.2s, v2.2d -; CHECK-NEXT: fcvtn v2.2s, v4.2d -; CHECK-NEXT: fcvtn v3.2s, v6.2d -; CHECK-NEXT: fcvtn2 v0.4s, v16.2d -; CHECK-NEXT: fcvtn2 v1.4s, v17.2d -; CHECK-NEXT: fcvtn2 v2.4s, v5.2d -; CHECK-NEXT: fcvtn2 v3.4s, v7.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: stofp_v16i64_v16f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov x13, d2 +; CHECK-SD-NEXT: mov x9, v0.d[1] +; CHECK-SD-NEXT: mov x10, v2.d[1] +; CHECK-SD-NEXT: fmov x11, d0 +; CHECK-SD-NEXT: mov x12, v4.d[1] +; CHECK-SD-NEXT: mov x8, v1.d[1] +; CHECK-SD-NEXT: scvtf s16, x13 +; CHECK-SD-NEXT: fmov x13, d4 +; CHECK-SD-NEXT: scvtf s0, x11 +; CHECK-SD-NEXT: mov x11, v6.d[1] +; CHECK-SD-NEXT: scvtf s17, x9 +; CHECK-SD-NEXT: scvtf s18, x10 +; 
CHECK-SD-NEXT: fmov x9, d1 +; CHECK-SD-NEXT: scvtf s1, x12 +; CHECK-SD-NEXT: fmov x12, d6 +; CHECK-SD-NEXT: scvtf s2, x13 +; CHECK-SD-NEXT: fmov x13, d3 +; CHECK-SD-NEXT: mov x10, v3.d[1] +; CHECK-SD-NEXT: scvtf s4, x11 +; CHECK-SD-NEXT: mov v0.s[1], v17.s[0] +; CHECK-SD-NEXT: scvtf s6, x9 +; CHECK-SD-NEXT: scvtf s3, x12 +; CHECK-SD-NEXT: mov v16.s[1], v18.s[0] +; CHECK-SD-NEXT: mov x9, v5.d[1] +; CHECK-SD-NEXT: fmov x11, d5 +; CHECK-SD-NEXT: scvtf s5, x13 +; CHECK-SD-NEXT: fmov x13, d7 +; CHECK-SD-NEXT: mov x12, v7.d[1] +; CHECK-SD-NEXT: mov v2.s[1], v1.s[0] +; CHECK-SD-NEXT: mov v0.s[2], v6.s[0] +; CHECK-SD-NEXT: scvtf s6, x10 +; CHECK-SD-NEXT: scvtf s7, x11 +; CHECK-SD-NEXT: scvtf s1, x13 +; CHECK-SD-NEXT: mov v3.s[1], v4.s[0] +; CHECK-SD-NEXT: mov v16.s[2], v5.s[0] +; CHECK-SD-NEXT: scvtf s4, x8 +; CHECK-SD-NEXT: scvtf s5, x9 +; CHECK-SD-NEXT: mov v2.s[2], v7.s[0] +; CHECK-SD-NEXT: mov v3.s[2], v1.s[0] +; CHECK-SD-NEXT: scvtf s1, x12 +; CHECK-SD-NEXT: mov v16.s[3], v6.s[0] +; CHECK-SD-NEXT: mov v0.s[3], v4.s[0] +; CHECK-SD-NEXT: mov v2.s[3], v5.s[0] +; CHECK-SD-NEXT: mov v3.s[3], v1.s[0] +; CHECK-SD-NEXT: mov v1.16b, v16.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: stofp_v16i64_v16f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: scvtf v0.2d, v0.2d +; CHECK-GI-NEXT: scvtf v2.2d, v2.2d +; CHECK-GI-NEXT: scvtf v4.2d, v4.2d +; CHECK-GI-NEXT: scvtf v6.2d, v6.2d +; CHECK-GI-NEXT: scvtf v16.2d, v1.2d +; CHECK-GI-NEXT: scvtf v17.2d, v3.2d +; CHECK-GI-NEXT: scvtf v5.2d, v5.2d +; CHECK-GI-NEXT: scvtf v7.2d, v7.2d +; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d +; CHECK-GI-NEXT: fcvtn v1.2s, v2.2d +; CHECK-GI-NEXT: fcvtn v2.2s, v4.2d +; CHECK-GI-NEXT: fcvtn v3.2s, v6.2d +; CHECK-GI-NEXT: fcvtn2 v0.4s, v16.2d +; CHECK-GI-NEXT: fcvtn2 v1.4s, v17.2d +; CHECK-GI-NEXT: fcvtn2 v2.4s, v5.2d +; CHECK-GI-NEXT: fcvtn2 v3.4s, v7.2d +; CHECK-GI-NEXT: ret entry: %c = sitofp <16 x i64> %a to <16 x float> ret <16 x float> %c } define <16 x float> @utofp_v16i64_v16f32(<16 x i64> %a) { -; 
CHECK-LABEL: utofp_v16i64_v16f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: ucvtf v2.2d, v2.2d -; CHECK-NEXT: ucvtf v4.2d, v4.2d -; CHECK-NEXT: ucvtf v6.2d, v6.2d -; CHECK-NEXT: ucvtf v16.2d, v1.2d -; CHECK-NEXT: ucvtf v17.2d, v3.2d -; CHECK-NEXT: ucvtf v5.2d, v5.2d -; CHECK-NEXT: ucvtf v7.2d, v7.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn v1.2s, v2.2d -; CHECK-NEXT: fcvtn v2.2s, v4.2d -; CHECK-NEXT: fcvtn v3.2s, v6.2d -; CHECK-NEXT: fcvtn2 v0.4s, v16.2d -; CHECK-NEXT: fcvtn2 v1.4s, v17.2d -; CHECK-NEXT: fcvtn2 v2.4s, v5.2d -; CHECK-NEXT: fcvtn2 v3.4s, v7.2d -; CHECK-NEXT: ret +; CHECK-SD-LABEL: utofp_v16i64_v16f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov x13, d2 +; CHECK-SD-NEXT: mov x9, v0.d[1] +; CHECK-SD-NEXT: mov x10, v2.d[1] +; CHECK-SD-NEXT: fmov x11, d0 +; CHECK-SD-NEXT: mov x12, v4.d[1] +; CHECK-SD-NEXT: mov x8, v1.d[1] +; CHECK-SD-NEXT: ucvtf s16, x13 +; CHECK-SD-NEXT: fmov x13, d4 +; CHECK-SD-NEXT: ucvtf s0, x11 +; CHECK-SD-NEXT: mov x11, v6.d[1] +; CHECK-SD-NEXT: ucvtf s17, x9 +; CHECK-SD-NEXT: ucvtf s18, x10 +; CHECK-SD-NEXT: fmov x9, d1 +; CHECK-SD-NEXT: ucvtf s1, x12 +; CHECK-SD-NEXT: fmov x12, d6 +; CHECK-SD-NEXT: ucvtf s2, x13 +; CHECK-SD-NEXT: fmov x13, d3 +; CHECK-SD-NEXT: mov x10, v3.d[1] +; CHECK-SD-NEXT: ucvtf s4, x11 +; CHECK-SD-NEXT: mov v0.s[1], v17.s[0] +; CHECK-SD-NEXT: ucvtf s6, x9 +; CHECK-SD-NEXT: ucvtf s3, x12 +; CHECK-SD-NEXT: mov v16.s[1], v18.s[0] +; CHECK-SD-NEXT: mov x9, v5.d[1] +; CHECK-SD-NEXT: fmov x11, d5 +; CHECK-SD-NEXT: ucvtf s5, x13 +; CHECK-SD-NEXT: fmov x13, d7 +; CHECK-SD-NEXT: mov x12, v7.d[1] +; CHECK-SD-NEXT: mov v2.s[1], v1.s[0] +; CHECK-SD-NEXT: mov v0.s[2], v6.s[0] +; CHECK-SD-NEXT: ucvtf s6, x10 +; CHECK-SD-NEXT: ucvtf s7, x11 +; CHECK-SD-NEXT: ucvtf s1, x13 +; CHECK-SD-NEXT: mov v3.s[1], v4.s[0] +; CHECK-SD-NEXT: mov v16.s[2], v5.s[0] +; CHECK-SD-NEXT: ucvtf s4, x8 +; CHECK-SD-NEXT: ucvtf s5, x9 +; CHECK-SD-NEXT: mov v2.s[2], v7.s[0] +; 
CHECK-SD-NEXT: mov v3.s[2], v1.s[0] +; CHECK-SD-NEXT: ucvtf s1, x12 +; CHECK-SD-NEXT: mov v16.s[3], v6.s[0] +; CHECK-SD-NEXT: mov v0.s[3], v4.s[0] +; CHECK-SD-NEXT: mov v2.s[3], v5.s[0] +; CHECK-SD-NEXT: mov v3.s[3], v1.s[0] +; CHECK-SD-NEXT: mov v1.16b, v16.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: utofp_v16i64_v16f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d +; CHECK-GI-NEXT: ucvtf v2.2d, v2.2d +; CHECK-GI-NEXT: ucvtf v4.2d, v4.2d +; CHECK-GI-NEXT: ucvtf v6.2d, v6.2d +; CHECK-GI-NEXT: ucvtf v16.2d, v1.2d +; CHECK-GI-NEXT: ucvtf v17.2d, v3.2d +; CHECK-GI-NEXT: ucvtf v5.2d, v5.2d +; CHECK-GI-NEXT: ucvtf v7.2d, v7.2d +; CHECK-GI-NEXT: fcvtn v0.2s, v0.2d +; CHECK-GI-NEXT: fcvtn v1.2s, v2.2d +; CHECK-GI-NEXT: fcvtn v2.2s, v4.2d +; CHECK-GI-NEXT: fcvtn v3.2s, v6.2d +; CHECK-GI-NEXT: fcvtn2 v0.4s, v16.2d +; CHECK-GI-NEXT: fcvtn2 v1.4s, v17.2d +; CHECK-GI-NEXT: fcvtn2 v2.4s, v5.2d +; CHECK-GI-NEXT: fcvtn2 v3.4s, v7.2d +; CHECK-GI-NEXT: ret entry: %c = uitofp <16 x i64> %a to <16 x float> ret <16 x float> %c @@ -4643,42 +4831,99 @@ entry: define <32 x float> @stofp_v32i64_v32f32(<32 x i64> %a) { ; CHECK-SD-LABEL: stofp_v32i64_v32f32: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldp q17, q16, [sp, #64] -; CHECK-SD-NEXT: scvtf v0.2d, v0.2d -; CHECK-SD-NEXT: ldp q19, q18, [sp, #32] -; CHECK-SD-NEXT: scvtf v2.2d, v2.2d -; CHECK-SD-NEXT: ldp q21, q20, [sp] -; CHECK-SD-NEXT: scvtf v4.2d, v4.2d -; CHECK-SD-NEXT: ldp q23, q22, [sp, #96] -; CHECK-SD-NEXT: scvtf v6.2d, v6.2d -; CHECK-SD-NEXT: scvtf v19.2d, v19.2d -; CHECK-SD-NEXT: scvtf v17.2d, v17.2d -; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d -; CHECK-SD-NEXT: scvtf v21.2d, v21.2d -; CHECK-SD-NEXT: scvtf v24.2d, v1.2d -; CHECK-SD-NEXT: fcvtn v1.2s, v2.2d -; CHECK-SD-NEXT: scvtf v23.2d, v23.2d -; CHECK-SD-NEXT: scvtf v25.2d, v3.2d -; CHECK-SD-NEXT: fcvtn v2.2s, v4.2d -; CHECK-SD-NEXT: scvtf v26.2d, v5.2d -; CHECK-SD-NEXT: fcvtn v3.2s, v6.2d -; CHECK-SD-NEXT: scvtf v27.2d, v7.2d -; CHECK-SD-NEXT: 
scvtf v20.2d, v20.2d -; CHECK-SD-NEXT: fcvtn v5.2s, v19.2d -; CHECK-SD-NEXT: scvtf v18.2d, v18.2d -; CHECK-SD-NEXT: fcvtn v4.2s, v21.2d -; CHECK-SD-NEXT: fcvtn v6.2s, v17.2d -; CHECK-SD-NEXT: scvtf v16.2d, v16.2d -; CHECK-SD-NEXT: fcvtn v7.2s, v23.2d -; CHECK-SD-NEXT: scvtf v17.2d, v22.2d -; CHECK-SD-NEXT: fcvtn2 v0.4s, v24.2d -; CHECK-SD-NEXT: fcvtn2 v1.4s, v25.2d -; CHECK-SD-NEXT: fcvtn2 v2.4s, v26.2d -; CHECK-SD-NEXT: fcvtn2 v3.4s, v27.2d -; CHECK-SD-NEXT: fcvtn2 v5.4s, v18.2d -; CHECK-SD-NEXT: fcvtn2 v4.4s, v20.2d -; CHECK-SD-NEXT: fcvtn2 v6.4s, v16.2d -; CHECK-SD-NEXT: fcvtn2 v7.4s, v17.2d +; CHECK-SD-NEXT: mov x8, v0.d[1] +; CHECK-SD-NEXT: fmov x10, d0 +; CHECK-SD-NEXT: mov v16.16b, v1.16b +; CHECK-SD-NEXT: fmov x11, d2 +; CHECK-SD-NEXT: ldp q24, q20, [sp] +; CHECK-SD-NEXT: mov x9, v2.d[1] +; CHECK-SD-NEXT: fmov x12, d3 +; CHECK-SD-NEXT: fmov x13, d4 +; CHECK-SD-NEXT: scvtf s0, x10 +; CHECK-SD-NEXT: ldp q21, q18, [sp, #32] +; CHECK-SD-NEXT: scvtf s2, x8 +; CHECK-SD-NEXT: scvtf s1, x11 +; CHECK-SD-NEXT: mov x10, v4.d[1] +; CHECK-SD-NEXT: fmov x11, d16 +; CHECK-SD-NEXT: ldp q19, q17, [sp, #96] +; CHECK-SD-NEXT: scvtf s22, x9 +; CHECK-SD-NEXT: mov x8, v3.d[1] +; CHECK-SD-NEXT: scvtf s4, x12 +; CHECK-SD-NEXT: mov x12, v24.d[1] +; CHECK-SD-NEXT: mov x9, v16.d[1] +; CHECK-SD-NEXT: scvtf s3, x11 +; CHECK-SD-NEXT: ldp q23, q16, [sp, #64] +; CHECK-SD-NEXT: mov v0.s[1], v2.s[0] +; CHECK-SD-NEXT: scvtf s25, x10 +; CHECK-SD-NEXT: fmov x10, d6 +; CHECK-SD-NEXT: mov v1.s[1], v22.s[0] +; CHECK-SD-NEXT: mov x11, v6.d[1] +; CHECK-SD-NEXT: scvtf s2, x13 +; CHECK-SD-NEXT: mov x13, v21.d[1] +; CHECK-SD-NEXT: fmov x14, d19 +; CHECK-SD-NEXT: scvtf s22, x9 +; CHECK-SD-NEXT: mov x9, v5.d[1] +; CHECK-SD-NEXT: fmov x15, d17 +; CHECK-SD-NEXT: mov v0.s[2], v3.s[0] +; CHECK-SD-NEXT: scvtf s3, x10 +; CHECK-SD-NEXT: fmov x10, d24 +; CHECK-SD-NEXT: mov v1.s[2], v4.s[0] +; CHECK-SD-NEXT: scvtf s24, x12 +; CHECK-SD-NEXT: scvtf s6, x11 +; CHECK-SD-NEXT: fmov x11, d5 +; CHECK-SD-NEXT: fmov x12, 
d7 +; CHECK-SD-NEXT: mov v2.s[1], v25.s[0] +; CHECK-SD-NEXT: scvtf s4, x10 +; CHECK-SD-NEXT: fmov x10, d21 +; CHECK-SD-NEXT: scvtf s21, x8 +; CHECK-SD-NEXT: mov x8, v23.d[1] +; CHECK-SD-NEXT: scvtf s25, x13 +; CHECK-SD-NEXT: mov x13, v19.d[1] +; CHECK-SD-NEXT: scvtf s26, x11 +; CHECK-SD-NEXT: mov x11, v20.d[1] +; CHECK-SD-NEXT: mov v3.s[1], v6.s[0] +; CHECK-SD-NEXT: scvtf s5, x10 +; CHECK-SD-NEXT: mov x10, v7.d[1] +; CHECK-SD-NEXT: scvtf s7, x14 +; CHECK-SD-NEXT: mov v4.s[1], v24.s[0] +; CHECK-SD-NEXT: scvtf s24, x12 +; CHECK-SD-NEXT: fmov x12, d20 +; CHECK-SD-NEXT: scvtf s20, x8 +; CHECK-SD-NEXT: fmov x8, d23 +; CHECK-SD-NEXT: scvtf s19, x13 +; CHECK-SD-NEXT: fmov x13, d18 +; CHECK-SD-NEXT: fmov x14, d16 +; CHECK-SD-NEXT: mov v2.s[2], v26.s[0] +; CHECK-SD-NEXT: mov v5.s[1], v25.s[0] +; CHECK-SD-NEXT: scvtf s23, x10 +; CHECK-SD-NEXT: mov v0.s[3], v22.s[0] +; CHECK-SD-NEXT: scvtf s6, x8 +; CHECK-SD-NEXT: mov x8, v18.d[1] +; CHECK-SD-NEXT: scvtf s18, x12 +; CHECK-SD-NEXT: mov x12, v16.d[1] +; CHECK-SD-NEXT: scvtf s16, x13 +; CHECK-SD-NEXT: mov x13, v17.d[1] +; CHECK-SD-NEXT: scvtf s17, x14 +; CHECK-SD-NEXT: mov v7.s[1], v19.s[0] +; CHECK-SD-NEXT: scvtf s19, x9 +; CHECK-SD-NEXT: mov v3.s[2], v24.s[0] +; CHECK-SD-NEXT: scvtf s24, x11 +; CHECK-SD-NEXT: mov v1.s[3], v21.s[0] +; CHECK-SD-NEXT: mov v6.s[1], v20.s[0] +; CHECK-SD-NEXT: scvtf s20, x15 +; CHECK-SD-NEXT: mov v4.s[2], v18.s[0] +; CHECK-SD-NEXT: scvtf s18, x8 +; CHECK-SD-NEXT: mov v5.s[2], v16.s[0] +; CHECK-SD-NEXT: scvtf s16, x12 +; CHECK-SD-NEXT: mov v2.s[3], v19.s[0] +; CHECK-SD-NEXT: mov v3.s[3], v23.s[0] +; CHECK-SD-NEXT: mov v6.s[2], v17.s[0] +; CHECK-SD-NEXT: mov v7.s[2], v20.s[0] +; CHECK-SD-NEXT: scvtf s17, x13 +; CHECK-SD-NEXT: mov v4.s[3], v24.s[0] +; CHECK-SD-NEXT: mov v5.s[3], v18.s[0] +; CHECK-SD-NEXT: mov v6.s[3], v16.s[0] +; CHECK-SD-NEXT: mov v7.s[3], v17.s[0] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: stofp_v32i64_v32f32: @@ -4728,42 +4973,99 @@ entry: define <32 x float> @utofp_v32i64_v32f32(<32 
x i64> %a) { ; CHECK-SD-LABEL: utofp_v32i64_v32f32: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: ldp q17, q16, [sp, #64] -; CHECK-SD-NEXT: ucvtf v0.2d, v0.2d -; CHECK-SD-NEXT: ldp q19, q18, [sp, #32] -; CHECK-SD-NEXT: ucvtf v2.2d, v2.2d -; CHECK-SD-NEXT: ldp q21, q20, [sp] -; CHECK-SD-NEXT: ucvtf v4.2d, v4.2d -; CHECK-SD-NEXT: ldp q23, q22, [sp, #96] -; CHECK-SD-NEXT: ucvtf v6.2d, v6.2d -; CHECK-SD-NEXT: ucvtf v19.2d, v19.2d -; CHECK-SD-NEXT: ucvtf v17.2d, v17.2d -; CHECK-SD-NEXT: fcvtn v0.2s, v0.2d -; CHECK-SD-NEXT: ucvtf v21.2d, v21.2d -; CHECK-SD-NEXT: ucvtf v24.2d, v1.2d -; CHECK-SD-NEXT: fcvtn v1.2s, v2.2d -; CHECK-SD-NEXT: ucvtf v23.2d, v23.2d -; CHECK-SD-NEXT: ucvtf v25.2d, v3.2d -; CHECK-SD-NEXT: fcvtn v2.2s, v4.2d -; CHECK-SD-NEXT: ucvtf v26.2d, v5.2d -; CHECK-SD-NEXT: fcvtn v3.2s, v6.2d -; CHECK-SD-NEXT: ucvtf v27.2d, v7.2d -; CHECK-SD-NEXT: ucvtf v20.2d, v20.2d -; CHECK-SD-NEXT: fcvtn v5.2s, v19.2d -; CHECK-SD-NEXT: ucvtf v18.2d, v18.2d -; CHECK-SD-NEXT: fcvtn v4.2s, v21.2d -; CHECK-SD-NEXT: fcvtn v6.2s, v17.2d -; CHECK-SD-NEXT: ucvtf v16.2d, v16.2d -; CHECK-SD-NEXT: fcvtn v7.2s, v23.2d -; CHECK-SD-NEXT: ucvtf v17.2d, v22.2d -; CHECK-SD-NEXT: fcvtn2 v0.4s, v24.2d -; CHECK-SD-NEXT: fcvtn2 v1.4s, v25.2d -; CHECK-SD-NEXT: fcvtn2 v2.4s, v26.2d -; CHECK-SD-NEXT: fcvtn2 v3.4s, v27.2d -; CHECK-SD-NEXT: fcvtn2 v5.4s, v18.2d -; CHECK-SD-NEXT: fcvtn2 v4.4s, v20.2d -; CHECK-SD-NEXT: fcvtn2 v6.4s, v16.2d -; CHECK-SD-NEXT: fcvtn2 v7.4s, v17.2d +; CHECK-SD-NEXT: mov x8, v0.d[1] +; CHECK-SD-NEXT: fmov x10, d0 +; CHECK-SD-NEXT: mov v16.16b, v1.16b +; CHECK-SD-NEXT: fmov x11, d2 +; CHECK-SD-NEXT: ldp q24, q20, [sp] +; CHECK-SD-NEXT: mov x9, v2.d[1] +; CHECK-SD-NEXT: fmov x12, d3 +; CHECK-SD-NEXT: fmov x13, d4 +; CHECK-SD-NEXT: ucvtf s0, x10 +; CHECK-SD-NEXT: ldp q21, q18, [sp, #32] +; CHECK-SD-NEXT: ucvtf s2, x8 +; CHECK-SD-NEXT: ucvtf s1, x11 +; CHECK-SD-NEXT: mov x10, v4.d[1] +; CHECK-SD-NEXT: fmov x11, d16 +; CHECK-SD-NEXT: ldp q19, q17, [sp, #96] +; 
CHECK-SD-NEXT: ucvtf s22, x9 +; CHECK-SD-NEXT: mov x8, v3.d[1] +; CHECK-SD-NEXT: ucvtf s4, x12 +; CHECK-SD-NEXT: mov x12, v24.d[1] +; CHECK-SD-NEXT: mov x9, v16.d[1] +; CHECK-SD-NEXT: ucvtf s3, x11 +; CHECK-SD-NEXT: ldp q23, q16, [sp, #64] +; CHECK-SD-NEXT: mov v0.s[1], v2.s[0] +; CHECK-SD-NEXT: ucvtf s25, x10 +; CHECK-SD-NEXT: fmov x10, d6 +; CHECK-SD-NEXT: mov v1.s[1], v22.s[0] +; CHECK-SD-NEXT: mov x11, v6.d[1] +; CHECK-SD-NEXT: ucvtf s2, x13 +; CHECK-SD-NEXT: mov x13, v21.d[1] +; CHECK-SD-NEXT: fmov x14, d19 +; CHECK-SD-NEXT: ucvtf s22, x9 +; CHECK-SD-NEXT: mov x9, v5.d[1] +; CHECK-SD-NEXT: fmov x15, d17 +; CHECK-SD-NEXT: mov v0.s[2], v3.s[0] +; CHECK-SD-NEXT: ucvtf s3, x10 +; CHECK-SD-NEXT: fmov x10, d24 +; CHECK-SD-NEXT: mov v1.s[2], v4.s[0] +; CHECK-SD-NEXT: ucvtf s24, x12 +; CHECK-SD-NEXT: ucvtf s6, x11 +; CHECK-SD-NEXT: fmov x11, d5 +; CHECK-SD-NEXT: fmov x12, d7 +; CHECK-SD-NEXT: mov v2.s[1], v25.s[0] +; CHECK-SD-NEXT: ucvtf s4, x10 +; CHECK-SD-NEXT: fmov x10, d21 +; CHECK-SD-NEXT: ucvtf s21, x8 +; CHECK-SD-NEXT: mov x8, v23.d[1] +; CHECK-SD-NEXT: ucvtf s25, x13 +; CHECK-SD-NEXT: mov x13, v19.d[1] +; CHECK-SD-NEXT: ucvtf s26, x11 +; CHECK-SD-NEXT: mov x11, v20.d[1] +; CHECK-SD-NEXT: mov v3.s[1], v6.s[0] +; CHECK-SD-NEXT: ucvtf s5, x10 +; CHECK-SD-NEXT: mov x10, v7.d[1] +; CHECK-SD-NEXT: ucvtf s7, x14 +; CHECK-SD-NEXT: mov v4.s[1], v24.s[0] +; CHECK-SD-NEXT: ucvtf s24, x12 +; CHECK-SD-NEXT: fmov x12, d20 +; CHECK-SD-NEXT: ucvtf s20, x8 +; CHECK-SD-NEXT: fmov x8, d23 +; CHECK-SD-NEXT: ucvtf s19, x13 +; CHECK-SD-NEXT: fmov x13, d18 +; CHECK-SD-NEXT: fmov x14, d16 +; CHECK-SD-NEXT: mov v2.s[2], v26.s[0] +; CHECK-SD-NEXT: mov v5.s[1], v25.s[0] +; CHECK-SD-NEXT: ucvtf s23, x10 +; CHECK-SD-NEXT: mov v0.s[3], v22.s[0] +; CHECK-SD-NEXT: ucvtf s6, x8 +; CHECK-SD-NEXT: mov x8, v18.d[1] +; CHECK-SD-NEXT: ucvtf s18, x12 +; CHECK-SD-NEXT: mov x12, v16.d[1] +; CHECK-SD-NEXT: ucvtf s16, x13 +; CHECK-SD-NEXT: mov x13, v17.d[1] +; CHECK-SD-NEXT: ucvtf s17, x14 +; 
CHECK-SD-NEXT: mov v7.s[1], v19.s[0] +; CHECK-SD-NEXT: ucvtf s19, x9 +; CHECK-SD-NEXT: mov v3.s[2], v24.s[0] +; CHECK-SD-NEXT: ucvtf s24, x11 +; CHECK-SD-NEXT: mov v1.s[3], v21.s[0] +; CHECK-SD-NEXT: mov v6.s[1], v20.s[0] +; CHECK-SD-NEXT: ucvtf s20, x15 +; CHECK-SD-NEXT: mov v4.s[2], v18.s[0] +; CHECK-SD-NEXT: ucvtf s18, x8 +; CHECK-SD-NEXT: mov v5.s[2], v16.s[0] +; CHECK-SD-NEXT: ucvtf s16, x12 +; CHECK-SD-NEXT: mov v2.s[3], v19.s[0] +; CHECK-SD-NEXT: mov v3.s[3], v23.s[0] +; CHECK-SD-NEXT: mov v6.s[2], v17.s[0] +; CHECK-SD-NEXT: mov v7.s[2], v20.s[0] +; CHECK-SD-NEXT: ucvtf s17, x13 +; CHECK-SD-NEXT: mov v4.s[3], v24.s[0] +; CHECK-SD-NEXT: mov v5.s[3], v18.s[0] +; CHECK-SD-NEXT: mov v6.s[3], v16.s[0] +; CHECK-SD-NEXT: mov v7.s[3], v17.s[0] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: utofp_v32i64_v32f32: diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll index 573fe3d8b8a77..1d9e01f4ecfdf 100644 --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-to-fp.ll @@ -722,8 +722,11 @@ define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 { ; CHECK-LABEL: ucvtf_v1i64_v1f32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: ucvtf s0, x8 +; CHECK-NEXT: mov v1.s[0], v0.s[0] +; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret %res = uitofp <1 x i64> %op1 to <1 x float> ret <1 x float> %res @@ -733,8 +736,12 @@ define <1 x float> @ucvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 { define <2 x float> @ucvtf_v2i64_v2f32(<2 x i64> %op1) vscale_range(2,0) #0 { ; CHECK-LABEL: ucvtf_v2i64_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: ucvtf s0, 
x9 +; CHECK-NEXT: ucvtf s1, x8 +; CHECK-NEXT: mov v0.s[1], v1.s[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %res = uitofp <2 x i64> %op1 to <2 x float> ret <2 x float> %res @@ -1646,8 +1653,11 @@ define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 { ; CHECK-LABEL: scvtf_v1i64_v1f32: ; CHECK: // %bb.0: ; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: movi d1, #0000000000000000 +; CHECK-NEXT: scvtf s0, x8 +; CHECK-NEXT: mov v1.s[0], v0.s[0] +; CHECK-NEXT: fmov d0, d1 ; CHECK-NEXT: ret %res = sitofp <1 x i64> %op1 to <1 x float> ret <1 x float> %res @@ -1657,8 +1667,12 @@ define <1 x float> @scvtf_v1i64_v1f32(<1 x i64> %op1) vscale_range(2,0) #0 { define <2 x float> @scvtf_v2i64_v2f32(<2 x i64> %op1) vscale_range(2,0) #0 { ; CHECK-LABEL: scvtf_v2i64_v2f32: ; CHECK: // %bb.0: -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: scvtf s0, x9 +; CHECK-NEXT: scvtf s1, x8 +; CHECK-NEXT: mov v0.s[1], v1.s[0] +; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-NEXT: ret %res = sitofp <2 x i64> %op1 to <2 x float> ret <2 x float> %res diff --git a/llvm/test/CodeGen/AArch64/vector-fcvt.ll b/llvm/test/CodeGen/AArch64/vector-fcvt.ll index 8f38bdbedc629..a6b43d514594e 100644 --- a/llvm/test/CodeGen/AArch64/vector-fcvt.ll +++ b/llvm/test/CodeGen/AArch64/vector-fcvt.ll @@ -87,14 +87,29 @@ define <8 x float> @sitofp_i32_float(<8 x i32> %a) { define <8 x float> @sitofp_i64_float(<8 x i64> %a) { ; CHECK-LABEL: sitofp_i64_float: ; CHECK: // %bb.0: -; CHECK-NEXT: scvtf v0.2d, v0.2d -; CHECK-NEXT: scvtf v2.2d, v2.2d -; CHECK-NEXT: scvtf v4.2d, v1.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn v1.2s, v2.2d -; CHECK-NEXT: scvtf v2.2d, v3.2d -; CHECK-NEXT: fcvtn2 v0.4s, v4.2d -; CHECK-NEXT: fcvtn2 v1.4s, v2.2d +; 
CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: mov x9, v2.d[1] +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fmov x11, d2 +; CHECK-NEXT: scvtf s0, x10 +; CHECK-NEXT: mov x10, v3.d[1] +; CHECK-NEXT: scvtf s4, x8 +; CHECK-NEXT: scvtf s5, x9 +; CHECK-NEXT: scvtf s2, x11 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: mov x8, v1.d[1] +; CHECK-NEXT: scvtf s1, x9 +; CHECK-NEXT: mov v0.s[1], v4.s[0] +; CHECK-NEXT: scvtf s3, x11 +; CHECK-NEXT: mov v2.s[1], v5.s[0] +; CHECK-NEXT: scvtf s4, x8 +; CHECK-NEXT: mov v0.s[2], v1.s[0] +; CHECK-NEXT: scvtf s1, x10 +; CHECK-NEXT: mov v2.s[2], v3.s[0] +; CHECK-NEXT: mov v0.s[3], v4.s[0] +; CHECK-NEXT: mov v2.s[3], v1.s[0] +; CHECK-NEXT: mov v1.16b, v2.16b ; CHECK-NEXT: ret %1 = sitofp <8 x i64> %a to <8 x float> ret <8 x float> %1 @@ -177,14 +192,29 @@ define <8 x float> @uitofp_i32_float(<8 x i32> %a) { define <8 x float> @uitofp_i64_float(<8 x i64> %a) { ; CHECK-LABEL: uitofp_i64_float: ; CHECK: // %bb.0: -; CHECK-NEXT: ucvtf v0.2d, v0.2d -; CHECK-NEXT: ucvtf v2.2d, v2.2d -; CHECK-NEXT: ucvtf v4.2d, v1.2d -; CHECK-NEXT: fcvtn v0.2s, v0.2d -; CHECK-NEXT: fcvtn v1.2s, v2.2d -; CHECK-NEXT: ucvtf v2.2d, v3.2d -; CHECK-NEXT: fcvtn2 v0.4s, v4.2d -; CHECK-NEXT: fcvtn2 v1.4s, v2.2d +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: mov x9, v2.d[1] +; CHECK-NEXT: fmov x10, d0 +; CHECK-NEXT: fmov x11, d2 +; CHECK-NEXT: ucvtf s0, x10 +; CHECK-NEXT: mov x10, v3.d[1] +; CHECK-NEXT: ucvtf s4, x8 +; CHECK-NEXT: ucvtf s5, x9 +; CHECK-NEXT: ucvtf s2, x11 +; CHECK-NEXT: fmov x9, d1 +; CHECK-NEXT: fmov x11, d3 +; CHECK-NEXT: mov x8, v1.d[1] +; CHECK-NEXT: ucvtf s1, x9 +; CHECK-NEXT: mov v0.s[1], v4.s[0] +; CHECK-NEXT: ucvtf s3, x11 +; CHECK-NEXT: mov v2.s[1], v5.s[0] +; CHECK-NEXT: ucvtf s4, x8 +; CHECK-NEXT: mov v0.s[2], v1.s[0] +; CHECK-NEXT: ucvtf s1, x10 +; CHECK-NEXT: mov v2.s[2], v3.s[0] +; CHECK-NEXT: mov v0.s[3], v4.s[0] +; CHECK-NEXT: mov v2.s[3], v1.s[0] +; CHECK-NEXT: mov v1.16b, v2.16b ; CHECK-NEXT: ret %1 = uitofp <8 x i64> %a 
to <8 x float> ret <8 x float> %1