From c331c4c260b6432b6ae96723f78c16b189e9297a Mon Sep 17 00:00:00 2001 From: Marian Lukac Date: Thu, 20 Feb 2025 15:35:45 +0000 Subject: [PATCH 1/5] [Clang][AArch64] Add fp8 variants for untyped NEON intrinsics This patch adds fp8 variants to existing intrinsics, whose operation doesn't depend on arguments being a specific type. --- clang/include/clang/Basic/arm_neon.td | 74 +- clang/lib/AST/Type.cpp | 5 + clang/lib/CodeGen/CGCall.cpp | 9 + clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 20 + clang/lib/Sema/SemaInit.cpp | 2 + .../fp8-intrinsics/acle_neon_fp8_untyped.c | 1114 +++++++++++++++++ 6 files changed, 1220 insertions(+), 4 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index ab0051efe5159..90f0e90e4a7f8 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -2090,17 +2090,17 @@ let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "r // Lookup table read with 2-bit/4-bit indices let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in { - def VLUTI2_B : SInst<"vluti2_lane", "Q.(qU)I", "cUcPcQcQUcQPc", + def VLUTI2_B : SInst<"vluti2_lane", "Q.(qU)I", "cUcPcmQcQUcQPcQm", [ImmCheck<2, ImmCheck0_1>]>; - def VLUTI2_B_Q : SInst<"vluti2_laneq", "Q.(QU)I", "cUcPcQcQUcQPc", + def VLUTI2_B_Q : SInst<"vluti2_laneq", "Q.(QU)I", "cUcPcmQcQUcQPcQm", [ImmCheck<2, ImmCheck0_3>]>; def VLUTI2_H : SInst<"vluti2_lane", "Q.(]>; def VLUTI2_H_Q : SInst<"vluti2_laneq", "Q.(]>; - def VLUTI4_B : SInst<"vluti4_lane", "..(qU)I", "QcQUcQPc", + def VLUTI4_B : SInst<"vluti4_lane", "..(qU)I", "QcQUcQPcQm", [ImmCheck<2, ImmCheck0_0>]>; - def VLUTI4_B_Q : SInst<"vluti4_laneq", "..UI", "QcQUcQPc", + def VLUTI4_B_Q : SInst<"vluti4_laneq", "..UI", "QcQUcQPcQm", [ImmCheck<2, ImmCheck0_1>]>; def VLUTI4_H_X2 : SInst<"vluti4_lane_x2", ".2(]>; @@ -2194,4 +2194,70 @@ let ArchGuard = 
"defined(__aarch64__)", TargetGuard = "fp8,neon" in { // fscale def FSCALE_V128 : WInst<"vscale", "..(.S)", "QdQfQh">; def FSCALE_V64 : WInst<"vscale", "(.q)(.q)(.qS)", "fh">; +} + +//FP8 versions of untyped intrinsics +let ArchGuard = "defined(__aarch64__)" in { + def VGET_LANE_MF8 : IInst<"vget_lane", "1.I", "mQm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; + def SPLAT_MF8 : WInst<"splat_lane", ".(!q)I", "mQm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; + def SPLATQ_MF8 : WInst<"splat_laneq", ".(!Q)I", "mQm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; + def VSET_LANE_MF8 : IInst<"vset_lane", ".1.I", "mQm", [ImmCheck<2, ImmCheckLaneIndex, 1>]>; + def VCREATE_MF8 : NoTestOpInst<"vcreate", ".(IU>)", "m", OP_CAST> { let BigEndianSafe = 1; } + let InstName = "vmov" in { + def VDUP_N_MF8 : WOpInst<"vdup_n", ".1", "mQm", OP_DUP>; + def VMOV_N_MF8 : WOpInst<"vmov_n", ".1", "mQm", OP_DUP>; + } + let InstName = "" in + def VDUP_LANE_MF8: WOpInst<"vdup_lane", ".qI", "mQm", OP_DUP_LN>; + def VCOMBINE_MF8 : NoTestOpInst<"vcombine", "Q..", "m", OP_CONC>; + let InstName = "vmov" in { + def VGET_HIGH_MF8 : NoTestOpInst<"vget_high", ".Q", "m", OP_HI>; + def VGET_LOW_MF8 : NoTestOpInst<"vget_low", ".Q", "m", OP_LO>; + } + let InstName = "vtbl" in { + def VTBL1_MF8 : WInst<"vtbl1", "..p", "m">; + def VTBL2_MF8 : WInst<"vtbl2", ".2p", "m">; + def VTBL3_MF8 : WInst<"vtbl3", ".3p", "m">; + def VTBL4_MF8 : WInst<"vtbl4", ".4p", "m">; + } + let InstName = "vtbx" in { + def VTBX1_MF8 : WInst<"vtbx1", "...p", "m">; + def VTBX2_MF8 : WInst<"vtbx2", "..2p", "m">; + def VTBX3_MF8 : WInst<"vtbx3", "..3p", "m">; + def VTBX4_MF8 : WInst<"vtbx4", "..4p", "m">; + } + def VEXT_MF8 : WInst<"vext", "...I", "mQm", [ImmCheck<2, ImmCheckLaneIndex, 0>]>; + def VREV64_MF8 : WOpInst<"vrev64", "..", "mQm", OP_REV64>; + def VREV32_MF8 : WOpInst<"vrev32", "..", "mQm", OP_REV32>; + def VREV16_MF8 : WOpInst<"vrev16", "..", "mQm", OP_REV16>; + let isHiddenLInst = 1 in + def VBSL_MF8 : SInst<"vbsl", ".U..", "mQm">; + def 
VTRN_MF8 : WInst<"vtrn", "2..", "mQm">; + def VZIP_MF8 : WInst<"vzip", "2..", "mQm">; + def VUZP_MF8 : WInst<"vuzp", "2..", "mQm">; + def COPY_LANE_MF8 : IOpInst<"vcopy_lane", "..I.I", "m", OP_COPY_LN>; + def COPYQ_LANE_MF8 : IOpInst<"vcopy_lane", "..IqI", "Qm", OP_COPY_LN>; + def COPY_LANEQ_MF8 : IOpInst<"vcopy_laneq", "..IQI", "m", OP_COPY_LN>; + def COPYQ_LANEQ_MF8 : IOpInst<"vcopy_laneq", "..I.I", "Qm", OP_COPY_LN>; + def VDUP_LANE2_MF8 : WOpInst<"vdup_laneq", ".QI", "mQm", OP_DUP_LN>; + def VTRN1_MF8 : SOpInst<"vtrn1", "...", "mQm", OP_TRN1>; + def VZIP1_MF8 : SOpInst<"vzip1", "...", "mQm", OP_ZIP1>; + def VUZP1_MF8 : SOpInst<"vuzp1", "...", "mQm", OP_UZP1>; + def VTRN2_MF8 : SOpInst<"vtrn2", "...", "mQm", OP_TRN2>; + def VZIP2_MF8 : SOpInst<"vzip2", "...", "mQm", OP_ZIP2>; + def VUZP2_MF8 : SOpInst<"vuzp2", "...", "mQm", OP_UZP2>; + let InstName = "vtbl" in { + def VQTBL1_A64_MF8 : WInst<"vqtbl1", ".QU", "mQm">; + def VQTBL2_A64_MF8 : WInst<"vqtbl2", ".(2Q)U", "mQm">; + def VQTBL3_A64_MF8 : WInst<"vqtbl3", ".(3Q)U", "mQm">; + def VQTBL4_A64_MF8 : WInst<"vqtbl4", ".(4Q)U", "mQm">; + } + let InstName = "vtbx" in { + def VQTBX1_A64_MF8 : WInst<"vqtbx1", "..QU", "mQm">; + def VQTBX2_A64_MF8 : WInst<"vqtbx2", "..(2Q)U", "mQm">; + def VQTBX3_A64_MF8 : WInst<"vqtbx3", "..(3Q)U", "mQm">; + def VQTBX4_A64_MF8 : WInst<"vqtbx4", "..(4Q)U", "mQm">; + } + def SCALAR_VDUP_LANE_MF8 : IInst<"vdup_lane", "1.I", "Sm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; + def SCALAR_VDUP_LANEQ_MF8 : IInst<"vdup_laneq", "1QI", "Sm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; } \ No newline at end of file diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 08798219c0b83..1404cb5b8007f 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -2782,6 +2782,11 @@ static bool isTriviallyCopyableTypeImpl(const QualType &type, if (CanonicalType->isScalarType() || CanonicalType->isVectorType()) return true; + // Mfloat8 type is a special case as it is not scalar, but is still 
trivially + // copyable. + if (CanonicalType->isMFloat8Type()) + return true; + if (const auto *RT = CanonicalType->getAs()) { if (const auto *ClassDecl = dyn_cast(RT->getDecl())) { if (IsCopyConstructible) { diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index 3cefa3b0c585c..77a955285aa30 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -5464,6 +5464,15 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, Builder.CreateStore(errorValue, swiftErrorTemp); } + // Mfloat8 type is loaded as scalar type, but is treated as single + // vector type for other operations. We need to bitcast it to the vector + // type here. + if (auto *EltTy = + dyn_cast(ArgInfo.getCoerceToType()); + EltTy && EltTy->getNumElements() == 1 && + V->getType() == EltTy->getScalarType()) + V = Builder.CreateBitCast(V, EltTy); + // We might have to widen integers, but we should never truncate. if (ArgInfo.getCoerceToType() != V->getType() && V->getType()->isIntegerTy()) diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index afe25b5418424..b8b9b4c903632 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -2623,22 +2623,26 @@ static bool HasExtraNeonArgument(unsigned BuiltinID) { case NEON::BI__builtin_neon_vget_lane_bf16: case NEON::BI__builtin_neon_vget_lane_i32: case NEON::BI__builtin_neon_vget_lane_i64: + case NEON::BI__builtin_neon_vget_lane_mf8: case NEON::BI__builtin_neon_vget_lane_f32: case NEON::BI__builtin_neon_vgetq_lane_i8: case NEON::BI__builtin_neon_vgetq_lane_i16: case NEON::BI__builtin_neon_vgetq_lane_bf16: case NEON::BI__builtin_neon_vgetq_lane_i32: case NEON::BI__builtin_neon_vgetq_lane_i64: + case NEON::BI__builtin_neon_vgetq_lane_mf8: case NEON::BI__builtin_neon_vgetq_lane_f32: case NEON::BI__builtin_neon_vduph_lane_bf16: case NEON::BI__builtin_neon_vduph_laneq_bf16: case NEON::BI__builtin_neon_vset_lane_i8: + 
case NEON::BI__builtin_neon_vset_lane_mf8: case NEON::BI__builtin_neon_vset_lane_i16: case NEON::BI__builtin_neon_vset_lane_bf16: case NEON::BI__builtin_neon_vset_lane_i32: case NEON::BI__builtin_neon_vset_lane_i64: case NEON::BI__builtin_neon_vset_lane_f32: case NEON::BI__builtin_neon_vsetq_lane_i8: + case NEON::BI__builtin_neon_vsetq_lane_mf8: case NEON::BI__builtin_neon_vsetq_lane_i16: case NEON::BI__builtin_neon_vsetq_lane_bf16: case NEON::BI__builtin_neon_vsetq_lane_i32: @@ -6161,6 +6165,10 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Builder.CreateBitCast(Ops[1], llvm::FixedVectorType::get(DoubleTy, 1)); Ops.push_back(EmitScalarExpr(E->getArg(2))); return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane"); + case NEON::BI__builtin_neon_vset_lane_mf8: + case NEON::BI__builtin_neon_vsetq_lane_mf8: + Ops.push_back(EmitScalarExpr(E->getArg(2))); + return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane"); case NEON::BI__builtin_neon_vsetq_lane_f64: // The vector type needs a cast for the v2f64 variant. 
Ops[1] = @@ -6180,6 +6188,12 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Builder.CreateBitCast(Ops[0], llvm::FixedVectorType::get(Int8Ty, 16)); return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), "vgetq_lane"); + case NEON::BI__builtin_neon_vget_lane_mf8: + case NEON::BI__builtin_neon_vdupb_lane_mf8: + case NEON::BI__builtin_neon_vgetq_lane_mf8: + case NEON::BI__builtin_neon_vdupb_laneq_mf8: + return Builder.CreateExtractElement(Ops[0], EmitScalarExpr(E->getArg(1)), + "vget_lane"); case NEON::BI__builtin_neon_vget_lane_i16: case NEON::BI__builtin_neon_vduph_lane_i16: Ops[0] = @@ -7629,6 +7643,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vuqadd"); } + case NEON::BI__builtin_neon_vluti2_laneq_mf8: case NEON::BI__builtin_neon_vluti2_laneq_bf16: case NEON::BI__builtin_neon_vluti2_laneq_f16: case NEON::BI__builtin_neon_vluti2_laneq_p16: @@ -7644,6 +7659,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, /*isQuad*/ false)); return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_laneq"); } + case NEON::BI__builtin_neon_vluti2q_laneq_mf8: case NEON::BI__builtin_neon_vluti2q_laneq_bf16: case NEON::BI__builtin_neon_vluti2q_laneq_f16: case NEON::BI__builtin_neon_vluti2q_laneq_p16: @@ -7659,6 +7675,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, /*isQuad*/ true)); return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_laneq"); } + case NEON::BI__builtin_neon_vluti2_lane_mf8: case NEON::BI__builtin_neon_vluti2_lane_bf16: case NEON::BI__builtin_neon_vluti2_lane_f16: case NEON::BI__builtin_neon_vluti2_lane_p16: @@ -7674,6 +7691,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, /*isQuad*/ false)); return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_lane"); } + case NEON::BI__builtin_neon_vluti2q_lane_mf8: case NEON::BI__builtin_neon_vluti2q_lane_bf16: case 
NEON::BI__builtin_neon_vluti2q_lane_f16: case NEON::BI__builtin_neon_vluti2q_lane_p16: @@ -7689,12 +7707,14 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, /*isQuad*/ true)); return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vluti2_lane"); } + case NEON::BI__builtin_neon_vluti4q_lane_mf8: case NEON::BI__builtin_neon_vluti4q_lane_p8: case NEON::BI__builtin_neon_vluti4q_lane_s8: case NEON::BI__builtin_neon_vluti4q_lane_u8: { Int = Intrinsic::aarch64_neon_vluti4q_lane; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_lane"); } + case NEON::BI__builtin_neon_vluti4q_laneq_mf8: case NEON::BI__builtin_neon_vluti4q_laneq_p8: case NEON::BI__builtin_neon_vluti4q_laneq_s8: case NEON::BI__builtin_neon_vluti4q_laneq_u8: { diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 9814c3f456f0d..890e9c3df7f62 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -1944,6 +1944,8 @@ void InitListChecker::CheckVectorType(const InitializedEntity &Entity, typeCode = "s"; else if (elementType->isUnsignedIntegerType()) typeCode = "u"; + else if (elementType->isMFloat8Type()) + typeCode = "mf"; else llvm_unreachable("Invalid element type!"); diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c new file mode 100644 index 0000000000000..fec1a93bdd5e9 --- /dev/null +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c @@ -0,0 +1,1114 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +#include + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +lut -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,sroa | FileCheck %s + +// REQUIRES: aarch64-registered-target + +// CHECK-LABEL: define dso_local <8 x i8> @test_vset_lane_mf8( +// CHECK-SAME: <1 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) 
#[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[A]] to i8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[B]], i8 [[TMP0]], i32 7 +// CHECK-NEXT: ret <8 x i8> [[VSET_LANE]] +// +mfloat8x8_t test_vset_lane_mf8(mfloat8_t a, mfloat8x8_t b) { + return vset_lane_mf8(a, b, 7); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vsetq_lane_mf8( +// CHECK-SAME: <1 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[A]] to i8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[B]], i8 [[TMP0]], i32 15 +// CHECK-NEXT: ret <16 x i8> [[VSET_LANE]] +// +mfloat8x16_t test_vsetq_lane_mf8(mfloat8_t a, mfloat8x16_t b) { + return vsetq_lane_mf8(a, b, 15); +} + + +// CHECK-LABEL: define dso_local <1 x i8> @test_vget_lane_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i32 7 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8 [[VGET_LANE]] to <1 x i8> +// CHECK-NEXT: ret <1 x i8> [[TMP0]] +// +mfloat8_t test_vget_lane_mf8(mfloat8x8_t a) { + return vget_lane_mf8(a, 7); +} + +// CHECK-LABEL: define dso_local <1 x i8> @test_vdupb_lane_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[A]], i32 7 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8 [[VGET_LANE]] to <1 x i8> +// CHECK-NEXT: ret <1 x i8> [[TMP0]] +// +mfloat8_t test_vdupb_lane_mf8(mfloat8x8_t a) { + return vdupb_lane_mf8(a, 7); +} + +// CHECK-LABEL: define dso_local <1 x i8> @test_vgetq_lane_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[A]], i32 15 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8 [[VGET_LANE]] to <1 x i8> +// CHECK-NEXT: ret <1 x i8> [[TMP0]] +// +mfloat8_t 
test_vgetq_lane_mf8(mfloat8x16_t a) { + return vgetq_lane_mf8(a, 15); +} + +// CHECK-LABEL: define dso_local <1 x i8> @test_vdupb_laneq_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[A]], i32 15 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8 [[VGET_LANE]] to <1 x i8> +// CHECK-NEXT: ret <1 x i8> [[TMP0]] +// +mfloat8_t test_vdupb_laneq_mf8(mfloat8x16_t a) { + return vdupb_laneq_mf8(a, 15); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vcreate_mf8( +// CHECK-SAME: i64 noundef [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i64 [[A]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// +mfloat8x8_t test_vcreate_mf8(uint64_t a) { + return vcreate_mf8(a); +} + + +// CHECK-LABEL: define dso_local <8 x i8> @test_vdup_n_mf8( +// CHECK-SAME: <1 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[A]] to i8 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[TMP0]] to <1 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i8> [[TMP1]] to i8 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[TMP2]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[TMP2]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[TMP2]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[TMP2]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[TMP2]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[TMP2]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[TMP2]], i32 7 +// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]] +// +mfloat8x8_t test_vdup_n_mf8(mfloat8_t a) { + return 
vdup_n_mf8(a); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vdupq_n_mf8( +// CHECK-SAME: <1 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[A]] to i8 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[TMP0]] to <1 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i8> [[TMP1]] to i8 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[TMP2]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[TMP2]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[TMP2]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[TMP2]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[TMP2]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[TMP2]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[TMP2]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[TMP2]], i32 7 +// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[TMP2]], i32 8 +// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[TMP2]], i32 9 +// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[TMP2]], i32 10 +// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[TMP2]], i32 11 +// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[TMP2]], i32 12 +// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[TMP2]], i32 13 +// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[TMP2]], i32 14 +// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[TMP2]], i32 15 +// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]] +// +mfloat8x16_t test_vdupq_n_mf8(mfloat8_t a) { + return 
vdupq_n_mf8(a); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vmov_n_mf8( +// CHECK-SAME: <1 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[A]] to i8 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[TMP0]] to <1 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i8> [[TMP1]] to i8 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[TMP2]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[TMP2]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[TMP2]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[TMP2]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[TMP2]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[TMP2]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[TMP2]], i32 7 +// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]] +// +mfloat8x8_t test_vmov_n_mf8(mfloat8_t a) { + return vmov_n_mf8(a); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vmovq_n_mf8( +// CHECK-SAME: <1 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[A]] to i8 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[TMP0]] to <1 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i8> [[TMP1]] to i8 +// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[TMP2]], i32 0 +// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[TMP2]], i32 1 +// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[TMP2]], i32 2 +// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[TMP2]], i32 3 +// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 
[[TMP2]], i32 4 +// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[TMP2]], i32 5 +// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[TMP2]], i32 6 +// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[TMP2]], i32 7 +// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[TMP2]], i32 8 +// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[TMP2]], i32 9 +// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[TMP2]], i32 10 +// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[TMP2]], i32 11 +// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[TMP2]], i32 12 +// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[TMP2]], i32 13 +// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[TMP2]], i32 14 +// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[TMP2]], i32 15 +// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]] +// +mfloat8x16_t test_vmovq_n_mf8(mfloat8_t a) { + return vmovq_n_mf8(a); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vcombine_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vcombine_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vcombine_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vget_high_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vget_high_mf8(mfloat8x16_t a) { + return vget_high_mf8(a); +} + +// 
CHECK-LABEL: define dso_local <8 x i8> @test_vget_low_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vget_low_mf8(mfloat8x16_t a) { + return vget_low_mf8(a); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtbl1_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VTBL11_I]] +// +mfloat8x8_t test_vtbl1_mf8(mfloat8x8_t a, uint8x8_t b) { + return vtbl1_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtbl2_mf8( +// CHECK-SAME: [2 x <8 x i8>] alignstack(8) [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[A_COERCE]], 0 +// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[A_COERCE]], 1 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[A_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <8 x i8>] [[DOTFCA_0_INSERT]], <8 x i8> [[A_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[DOTFCA_1_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[DOTFCA_1_INSERT]], 1 +// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_1_INSERT_FCA_0_EXTRACT]], <8 x i8> [[DOTFCA_1_INSERT_FCA_1_EXTRACT]], <16 x i32> +// CHECK-NEXT: [[VTBL13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[B]]) +// CHECK-NEXT: 
ret <8 x i8> [[VTBL13_I]] +// +mfloat8x8_t test_vtbl2_mf8(mfloat8x8x2_t a, uint8x8_t b) { + return vtbl2_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtbl3_mf8( +// CHECK-SAME: [3 x <8 x i8>] alignstack(8) [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[A_COERCE]], 0 +// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[A_COERCE]], 1 +// CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[A_COERCE]], 2 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <8 x i8>] poison, <8 x i8> [[A_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <8 x i8>] [[DOTFCA_0_INSERT]], <8 x i8> [[A_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <8 x i8>] [[DOTFCA_1_INSERT]], <8 x i8> [[A_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[DOTFCA_2_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[DOTFCA_2_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[DOTFCA_2_INSERT]], 2 +// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_2_INSERT_FCA_0_EXTRACT]], <8 x i8> [[DOTFCA_2_INSERT_FCA_1_EXTRACT]], <16 x i32> +// CHECK-NEXT: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_2_INSERT_FCA_2_EXTRACT]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VTBL26_I]] +// +mfloat8x8_t test_vtbl3_mf8(mfloat8x8x3_t a, uint8x8_t b) { + return vtbl3_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtbl4_mf8( +// CHECK-SAME: [4 x <8 x i8>] alignstack(8) [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[A_COERCE]], 0 +// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[A_COERCE]], 1 +// CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[A_COERCE]], 2 +// CHECK-NEXT: [[A_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[A_COERCE]], 3 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <8 x i8>] poison, <8 x i8> [[A_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <8 x i8>] [[DOTFCA_0_INSERT]], <8 x i8> [[A_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <8 x i8>] [[DOTFCA_1_INSERT]], <8 x i8> [[A_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <8 x i8>] [[DOTFCA_2_INSERT]], <8 x i8> [[A_COERCE_FCA_3_EXTRACT]], 3 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[DOTFCA_3_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[DOTFCA_3_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[DOTFCA_3_INSERT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[DOTFCA_3_INSERT]], 3 +// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_3_INSERT_FCA_0_EXTRACT]], <8 x i8> [[DOTFCA_3_INSERT_FCA_1_EXTRACT]], <16 x i32> +// CHECK-NEXT: [[VTBL27_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_3_INSERT_FCA_2_EXTRACT]], <8 x i8> [[DOTFCA_3_INSERT_FCA_3_EXTRACT]], <16 x i32> +// CHECK-NEXT: [[VTBL28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL27_I]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VTBL28_I]] +// +mfloat8x8_t test_vtbl4_mf8(mfloat8x8x4_t a, uint8x8_t b) { + return vtbl4_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtbx1_mf8( +// CHECK-SAME: <8 x 
i8> [[A:%.*]], <8 x i8> [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTBL1_I:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK-NEXT: [[VTBL11_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[VTBL1_I]], <8 x i8> [[C]]) +// CHECK-NEXT: [[TMP0:%.*]] = icmp uge <8 x i8> [[C]], splat (i8 8) +// CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[A]] +// CHECK-NEXT: [[TMP3:%.*]] = xor <8 x i8> [[TMP1]], splat (i8 -1) +// CHECK-NEXT: [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL11_I]] +// CHECK-NEXT: [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]] +// CHECK-NEXT: ret <8 x i8> [[VTBX_I]] +// +mfloat8x8_t test_vtbx1_mf8(mfloat8x8_t a, mfloat8x8_t b, uint8x8_t c) { + return vtbx1_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtbx2_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], [2 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <8 x i8>] poison, <8 x i8> [[B_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <8 x i8>] [[DOTFCA_0_INSERT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[DOTFCA_1_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[DOTFCA_1_INSERT]], 1 +// CHECK-NEXT: [[VTBX1_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_1_INSERT_FCA_0_EXTRACT]], <8 x i8> [[DOTFCA_1_INSERT_FCA_1_EXTRACT]], <16 x i32> +// CHECK-NEXT: [[VTBX13_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX1_I]], <8 x 
i8> [[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX13_I]] +// +mfloat8x8_t test_vtbx2_mf8(mfloat8x8_t a, mfloat8x8x2_t b, uint8x8_t c) { + return vtbx2_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtbx3_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], [3 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <8 x i8>] poison, <8 x i8> [[B_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <8 x i8>] [[DOTFCA_0_INSERT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <8 x i8>] [[DOTFCA_1_INSERT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[DOTFCA_2_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[DOTFCA_2_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <8 x i8>] [[DOTFCA_2_INSERT]], 2 +// CHECK-NEXT: [[VTBL2_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_2_INSERT_FCA_0_EXTRACT]], <8 x i8> [[DOTFCA_2_INSERT_FCA_1_EXTRACT]], <16 x i32> +// CHECK-NEXT: [[VTBL25_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_2_INSERT_FCA_2_EXTRACT]], <8 x i8> zeroinitializer, <16 x i32> +// CHECK-NEXT: [[VTBL26_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[VTBL2_I]], <16 x i8> [[VTBL25_I]], <8 x i8> [[C]]) +// CHECK-NEXT: [[TMP0:%.*]] = icmp uge <8 x i8> [[C]], splat (i8 24) +// CHECK-NEXT: [[TMP1:%.*]] = sext <8 x i1> [[TMP0]] to <8 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = and <8 x i8> [[TMP1]], [[A]] +// CHECK-NEXT: [[TMP3:%.*]] = 
xor <8 x i8> [[TMP1]], splat (i8 -1) +// CHECK-NEXT: [[TMP4:%.*]] = and <8 x i8> [[TMP3]], [[VTBL26_I]] +// CHECK-NEXT: [[VTBX_I:%.*]] = or <8 x i8> [[TMP2]], [[TMP4]] +// CHECK-NEXT: ret <8 x i8> [[VTBX_I]] +// +mfloat8x8_t test_vtbx3_mf8(mfloat8x8_t a, mfloat8x8x3_t b, uint8x8_t c) { + return vtbx3_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtbx4_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], [4 x <8 x i8>] alignstack(8) [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <8 x i8>] poison, <8 x i8> [[B_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <8 x i8>] [[DOTFCA_0_INSERT]], <8 x i8> [[B_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <8 x i8>] [[DOTFCA_1_INSERT]], <8 x i8> [[B_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <8 x i8>] [[DOTFCA_2_INSERT]], <8 x i8> [[B_COERCE_FCA_3_EXTRACT]], 3 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[DOTFCA_3_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[DOTFCA_3_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[DOTFCA_3_INSERT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <8 x i8>] [[DOTFCA_3_INSERT]], 3 +// CHECK-NEXT: [[VTBX2_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_3_INSERT_FCA_0_EXTRACT]], <8 x i8> [[DOTFCA_3_INSERT_FCA_1_EXTRACT]], <16 x i32> +// 
CHECK-NEXT: [[VTBX27_I:%.*]] = shufflevector <8 x i8> [[DOTFCA_3_INSERT_FCA_2_EXTRACT]], <8 x i8> [[DOTFCA_3_INSERT_FCA_3_EXTRACT]], <16 x i32> +// CHECK-NEXT: [[VTBX28_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[VTBX2_I]], <16 x i8> [[VTBX27_I]], <8 x i8> [[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX28_I]] +// +mfloat8x8_t test_vtbx4_mf8(mfloat8x8_t a, mfloat8x8x4_t b, uint8x8_t c) { + return vtbx4_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vext_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[VEXT]] +// +mfloat8x8_t test_vext_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vext_mf8(a, b, 7); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vextq_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VEXT:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[VEXT]] +// +mfloat8x16_t test_vextq_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vextq_mf8(a, b, 7); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vrev64_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vrev64_mf8(mfloat8x8_t a) { + return vrev64_mf8(a); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vrev64q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vrev64q_mf8(mfloat8x16_t a) { + return vrev64q_mf8(a); +} + +// CHECK-LABEL: define dso_local <8 x i8> 
@test_vrev32_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vrev32_mf8(mfloat8x8_t a) { + return vrev32_mf8(a); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vrev32q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vrev32q_mf8(mfloat8x16_t a) { + return vrev32q_mf8(a); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vrev16_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vrev16_mf8(mfloat8x8_t a) { + return vrev16_mf8(a); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vrev16q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vrev16q_mf8(mfloat8x16_t a) { + return vrev16q_mf8(a); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vbsl_mf8( +// CHECK-SAME: <8 x i8> noundef [[V1:%.*]], <8 x i8> [[V2:%.*]], <8 x i8> [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VBSL_I:%.*]] = and <8 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP0:%.*]] = xor <8 x i8> [[V1]], splat (i8 -1) +// CHECK-NEXT: [[VBSL1_I:%.*]] = and <8 x i8> [[TMP0]], [[V3]] +// CHECK-NEXT: [[VBSL2_I:%.*]] = or <8 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK-NEXT: ret <8 x i8> [[VBSL2_I]] +// +mfloat8x8_t test_vbsl_mf8(uint8x8_t v1, mfloat8x8_t v2, mfloat8x8_t v3) { + return vbsl_mf8(v1, v2, v3); +} 
+ +// CHECK-LABEL: define dso_local <16 x i8> @test_vbslq_mf8( +// CHECK-SAME: <16 x i8> noundef [[V1:%.*]], <16 x i8> [[V2:%.*]], <16 x i8> [[V3:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VBSL_I:%.*]] = and <16 x i8> [[V1]], [[V2]] +// CHECK-NEXT: [[TMP0:%.*]] = xor <16 x i8> [[V1]], splat (i8 -1) +// CHECK-NEXT: [[VBSL1_I:%.*]] = and <16 x i8> [[TMP0]], [[V3]] +// CHECK-NEXT: [[VBSL2_I:%.*]] = or <16 x i8> [[VBSL_I]], [[VBSL1_I]] +// CHECK-NEXT: ret <16 x i8> [[VBSL2_I]] +// +mfloat8x16_t test_vbslq_mf8(uint8x16_t v1, mfloat8x16_t v2, mfloat8x16_t v3) { + return vbslq_mf8(v1, v2, v3); +} + +// CHECK-LABEL: define dso_local %struct.mfloat8x8x2_t @test_vtrn_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T:%.*]] poison, <8 x i8> [[VTRN_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> [[VTRN1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_1_INSERT]] +// +mfloat8x8x2_t test_vtrn_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vtrn_mf8(a, b); +} + +// CHECK-LABEL: define dso_local %struct.mfloat8x16x2_t @test_vtrnq_mf8( +// 
CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTRN_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[VTRN1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T:%.*]] poison, <16 x i8> [[VTRN_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VTRN1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_1_INSERT]] +// +mfloat8x16x2_t test_vtrnq_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vtrnq_mf8(a, b); +} + +// CHECK-LABEL: define dso_local %struct.mfloat8x8x2_t @test_vzip_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T:%.*]] poison, <8 x i8> [[VZIP_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> [[VZIP1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x 
<8 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_1_INSERT]] +// +mfloat8x8x2_t test_vzip_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vzip_mf8(a, b); +} + +// CHECK-LABEL: define dso_local %struct.mfloat8x16x2_t @test_vzipq_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VZIP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[VZIP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T:%.*]] poison, <16 x i8> [[VZIP_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VZIP1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_1_INSERT]] +// +mfloat8x16x2_t test_vzipq_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vzipq_mf8(a, b); +} + +// CHECK-LABEL: define dso_local %struct.mfloat8x8x2_t @test_vuzp_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T:%.*]] poison, <8 x i8> [[VUZP_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_0_INSERT1]], <8 x i8> [[VUZP1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue [2 x <8 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] poison, <8 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_0_INSERT]], <8 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_MFLOAT8X8X2_T]] [[DOTFCA_0_1_INSERT]] +// +mfloat8x8x2_t test_vuzp_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vuzp_mf8(a, b); +} + +// CHECK-LABEL: define dso_local %struct.mfloat8x16x2_t @test_vuzpq_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VUZP_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[VUZP1_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: [[DOTFCA_0_0_INSERT1:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T:%.*]] poison, <16 x i8> [[VUZP_I]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT2:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_0_INSERT1]], <16 x i8> [[VUZP1_I]], 0, 1 +// CHECK-NEXT: [[TMP0:%.*]] = extractvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_1_INSERT2]], 0 +// CHECK-NEXT: [[DOTFCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[TMP0]], 0 +// CHECK-NEXT: [[DOTFCA_1_EXTRACT:%.*]] = extractvalue 
[2 x <16 x i8>] [[TMP0]], 1 +// CHECK-NEXT: [[DOTFCA_0_0_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] poison, <16 x i8> [[DOTFCA_0_EXTRACT]], 0, 0 +// CHECK-NEXT: [[DOTFCA_0_1_INSERT:%.*]] = insertvalue [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_0_INSERT]], <16 x i8> [[DOTFCA_1_EXTRACT]], 0, 1 +// CHECK-NEXT: ret [[STRUCT_MFLOAT8X16X2_T]] [[DOTFCA_0_1_INSERT]] +// +mfloat8x16x2_t test_vuzpq_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vuzpq_mf8(a, b); +} + +// CHECK-LABEL: define dso_local void @test_vcopy_lane_mf8( +// CHECK-SAME: <8 x i8> [[ARG_I8X8:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[ARG_I8X8]], i32 0 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[ARG_I8X8]], i8 [[VGET_LANE]], i32 0 +// CHECK-NEXT: ret void +// +void test_vcopy_lane_mf8(mfloat8x8_t arg_i8x8) { + vcopy_lane_mf8(arg_i8x8, 0, arg_i8x8, 0); +} + +// CHECK-LABEL: define dso_local void @test_vcopyq_lane_mf8( +// CHECK-SAME: <8 x i8> [[ARG_I8X8:%.*]], <16 x i8> [[ARG_I8X16:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[ARG_I8X8]], i32 0 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[ARG_I8X16]], i8 [[VGET_LANE]], i32 0 +// CHECK-NEXT: ret void +// +void test_vcopyq_lane_mf8(mfloat8x8_t arg_i8x8, mfloat8x16_t arg_i8x16) { + vcopyq_lane_mf8(arg_i8x16, 0, arg_i8x8, 0); +} + +// CHECK-LABEL: define dso_local void @test_vcopy_laneq_mf8( +// CHECK-SAME: <8 x i8> [[ARG_I8X8:%.*]], <16 x i8> [[ARG_I8X16:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[ARG_I8X16]], i32 0 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[ARG_I8X8]], i8 [[VGET_LANE]], i32 0 +// CHECK-NEXT: ret void +// +void test_vcopy_laneq_mf8(mfloat8x8_t arg_i8x8, mfloat8x16_t arg_i8x16) { + vcopy_laneq_mf8(arg_i8x8, 0, arg_i8x16, 0); +} + +// CHECK-LABEL: define dso_local void 
@test_vcopyq_laneq_mf8( +// CHECK-SAME: <16 x i8> [[ARG_I8X16:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[ARG_I8X16]], i32 0 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[ARG_I8X16]], i8 [[VGET_LANE]], i32 0 +// CHECK-NEXT: ret void +// +void test_vcopyq_laneq_mf8(mfloat8x16_t arg_i8x16) { + vcopyq_laneq_mf8(arg_i8x16, 0, arg_i8x16, 0); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vdup_lane_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// +mfloat8x8_t test_vdup_lane_mf8(mfloat8x8_t a) { + return vdup_lane_mf8(a, 7); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vdupq_lane_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[A]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[LANE]] +// +mfloat8x16_t test_vdupq_lane_mf8(mfloat8x8_t a) { + return vdupq_lane_mf8(a, 7); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vdup_laneq_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[LANE]] +// +mfloat8x8_t test_vdup_laneq_mf8(mfloat8x16_t a) { + return vdup_laneq_mf8(a, 7); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vdupq_laneq_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[LANE:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[A]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[LANE]] +// +mfloat8x16_t test_vdupq_laneq_mf8(mfloat8x16_t a) { + return vdupq_laneq_mf8(a, 7); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn1_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vtrn1_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vtrn1_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn1q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vtrn1q_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vtrn1q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vzip1_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vzip1_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vzip1_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vzip1q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vzip1q_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vzip1q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp1_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vuzp1_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vuzp1_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp1q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vuzp1q_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vuzp1q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vtrn2_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vtrn2_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vtrn2_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vtrn2q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vtrn2q_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vtrn2q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vzip2_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vzip2_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vzip2_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vzip2q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vzip2q_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vzip2q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vuzp2_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <8 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: 
[[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> [[B]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[SHUFFLE_I]] +// +mfloat8x8_t test_vuzp2_mf8(mfloat8x8_t a, mfloat8x8_t b) { + return vuzp2_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vuzp2q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[SHUFFLE_I:%.*]] = shufflevector <16 x i8> [[A]], <16 x i8> [[B]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[SHUFFLE_I]] +// +mfloat8x16_t test_vuzp2q_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vuzp2q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vqtbl1_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTBL1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl1.v8i8(<16 x i8> [[A]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VTBL1_I]] +// +mfloat8x8_t test_vqtbl1_mf8(mfloat8x16_t a, uint8x8_t b) { + return vqtbl1_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vqtbl1q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTBL1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VTBL1_I]] +// +mfloat8x16_t test_vqtbl1q_mf8(mfloat8x16_t a, mfloat8x16_t b) { + return vqtbl1q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vqtbl2_mf8( +// CHECK-SAME: [2 x <16 x i8>] alignstack(16) [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[A_COERCE]], 0 +// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[A_COERCE]], 1 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[A_COERCE_FCA_0_EXTRACT]], 0 +// 
CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[A_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[DOTFCA_1_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[DOTFCA_1_INSERT]], 1 +// CHECK-NEXT: [[VTBL2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl2.v8i8(<16 x i8> [[DOTFCA_1_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_INSERT_FCA_1_EXTRACT]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VTBL2_I]] +// +mfloat8x8_t test_vqtbl2_mf8(mfloat8x16x2_t a, uint8x8_t b) { + return vqtbl2_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vqtbl2q_mf8( +// CHECK-SAME: [2 x <16 x i8>] alignstack(16) [[A_COERCE:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[A_COERCE]], 0 +// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[A_COERCE]], 1 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[A_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[A_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[DOTFCA_1_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[DOTFCA_1_INSERT]], 1 +// CHECK-NEXT: [[VTBL2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl2.v16i8(<16 x i8> [[DOTFCA_1_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_INSERT_FCA_1_EXTRACT]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VTBL2_I]] +// +mfloat8x16_t test_vqtbl2q_mf8(mfloat8x16x2_t a, mfloat8x16_t b) { + return vqtbl2q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vqtbl3q_mf8( +// CHECK-SAME: [3 x <16 x i8>] alignstack(16) [[A_COERCE:%.*]], <16 x i8> [[B:%.*]]) 
#[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[A_COERCE]], 0 +// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[A_COERCE]], 1 +// CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[A_COERCE]], 2 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> [[A_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[A_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <16 x i8>] [[DOTFCA_1_INSERT]], <16 x i8> [[A_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 2 +// CHECK-NEXT: [[VTBL3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl3.v16i8(<16 x i8> [[DOTFCA_2_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_2_INSERT_FCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_INSERT_FCA_2_EXTRACT]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VTBL3_I]] +// +mfloat8x16_t test_vqtbl3q_mf8(mfloat8x16x3_t a, mfloat8x16_t b) { + return vqtbl3q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vqtbl3_mf8( +// CHECK-SAME: [3 x <16 x i8>] alignstack(16) [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[A_COERCE]], 0 +// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[A_COERCE]], 1 +// CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[A_COERCE]], 2 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> 
[[A_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[A_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <16 x i8>] [[DOTFCA_1_INSERT]], <16 x i8> [[A_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 2 +// CHECK-NEXT: [[VTBL3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl3.v8i8(<16 x i8> [[DOTFCA_2_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_2_INSERT_FCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_INSERT_FCA_2_EXTRACT]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VTBL3_I]] +// +mfloat8x8_t test_vqtbl3_mf8(mfloat8x16x3_t a, uint8x8_t b) { + return vqtbl3_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vqtbl4_mf8( +// CHECK-SAME: [4 x <16 x i8>] alignstack(16) [[A_COERCE:%.*]], <8 x i8> noundef [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[A_COERCE]], 0 +// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[A_COERCE]], 1 +// CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[A_COERCE]], 2 +// CHECK-NEXT: [[A_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[A_COERCE]], 3 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[A_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[A_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_1_INSERT]], <16 x i8> [[A_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <16 
x i8>] [[DOTFCA_2_INSERT]], <16 x i8> [[A_COERCE_FCA_3_EXTRACT]], 3 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 3 +// CHECK-NEXT: [[VTBL4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbl4.v8i8(<16 x i8> [[DOTFCA_3_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_1_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_3_EXTRACT]], <8 x i8> [[B]]) +// CHECK-NEXT: ret <8 x i8> [[VTBL4_I]] +// +mfloat8x8_t test_vqtbl4_mf8(mfloat8x16x4_t a, uint8x8_t b) { + return vqtbl4_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vqtbl4q_mf8( +// CHECK-SAME: [4 x <16 x i8>] alignstack(16) [[A_COERCE:%.*]], <16 x i8> [[B:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[A_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[A_COERCE]], 0 +// CHECK-NEXT: [[A_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[A_COERCE]], 1 +// CHECK-NEXT: [[A_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[A_COERCE]], 2 +// CHECK-NEXT: [[A_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[A_COERCE]], 3 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[A_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[A_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_1_INSERT]], <16 x i8> [[A_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_2_INSERT]], <16 x i8> [[A_COERCE_FCA_3_EXTRACT]], 3 +// 
CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 3 +// CHECK-NEXT: [[VTBL4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbl4.v16i8(<16 x i8> [[DOTFCA_3_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_1_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_3_EXTRACT]], <16 x i8> [[B]]) +// CHECK-NEXT: ret <16 x i8> [[VTBL4_I]] +// +mfloat8x16_t test_vqtbl4q_mf8(mfloat8x16x4_t a, mfloat8x16_t b) { + return vqtbl4q_mf8(a, b); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vqtbx1_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTBX1_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx1.v8i8(<8 x i8> [[A]], <16 x i8> [[B]], <8 x i8> [[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX1_I]] +// +mfloat8x8_t test_vqtbx1_mf8(mfloat8x8_t a, mfloat8x16_t b, uint8x8_t c) { + return vqtbx1_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vqtbx1q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], <16 x i8> [[B:%.*]], <16 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VTBX1_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx1.v16i8(<16 x i8> [[A]], <16 x i8> [[B]], <16 x i8> [[C]]) +// CHECK-NEXT: ret <16 x i8> [[VTBX1_I]] +// +mfloat8x16_t test_vqtbx1q_mf8(mfloat8x16_t a, mfloat8x16_t b, uint8x16_t c) { + return vqtbx1q_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vqtbx2_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// 
CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[B_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[DOTFCA_1_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[DOTFCA_1_INSERT]], 1 +// CHECK-NEXT: [[VTBX2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx2.v8i8(<8 x i8> [[A]], <16 x i8> [[DOTFCA_1_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_INSERT_FCA_1_EXTRACT]], <8 x i8> [[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX2_I]] +// +mfloat8x8_t test_vqtbx2_mf8(mfloat8x8_t a, mfloat8x16x2_t b, uint8x8_t c) { + return vqtbx2_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vqtbx2q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], [2 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [2 x <16 x i8>] poison, <16 x i8> [[B_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [2 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[DOTFCA_1_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [2 x <16 x i8>] [[DOTFCA_1_INSERT]], 1 +// CHECK-NEXT: [[VTBX2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx2.v16i8(<16 x i8> [[A]], 
<16 x i8> [[DOTFCA_1_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_1_INSERT_FCA_1_EXTRACT]], <16 x i8> [[C]]) +// CHECK-NEXT: ret <16 x i8> [[VTBX2_I]] +// +mfloat8x16_t test_vqtbx2q_mf8(mfloat8x16_t a, mfloat8x16x2_t b, mfloat8x16_t c) { + return vqtbx2q_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vqtbx3_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], [3 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> [[B_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <16 x i8>] [[DOTFCA_1_INSERT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 2 +// CHECK-NEXT: [[VTBX3_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx3.v8i8(<8 x i8> [[A]], <16 x i8> [[DOTFCA_2_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_2_INSERT_FCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_INSERT_FCA_2_EXTRACT]], <8 x i8> [[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX3_I]] +// +mfloat8x8_t test_vqtbx3_mf8(mfloat8x8_t a, mfloat8x16x3_t b, uint8x8_t c) { + return vqtbx3_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vqtbx3q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], [3 x <16 x i8>] 
alignstack(16) [[B_COERCE:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [3 x <16 x i8>] poison, <16 x i8> [[B_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [3 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [3 x <16 x i8>] [[DOTFCA_1_INSERT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [3 x <16 x i8>] [[DOTFCA_2_INSERT]], 2 +// CHECK-NEXT: [[VTBX3_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx3.v16i8(<16 x i8> [[A]], <16 x i8> [[DOTFCA_2_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_2_INSERT_FCA_1_EXTRACT]], <16 x i8> [[DOTFCA_2_INSERT_FCA_2_EXTRACT]], <16 x i8> [[C]]) +// CHECK-NEXT: ret <16 x i8> [[VTBX3_I]] +// +mfloat8x16_t test_vqtbx3q_mf8(mfloat8x16_t a, mfloat8x16x3_t b, mfloat8x16_t c) { + return vqtbx3q_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vqtbx4_mf8( +// CHECK-SAME: <8 x i8> [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]], <8 x i8> noundef [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2 +// 
CHECK-NEXT: [[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[B_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_1_INSERT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_2_INSERT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], 3 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 3 +// CHECK-NEXT: [[VTBX4_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.tbx4.v8i8(<8 x i8> [[A]], <16 x i8> [[DOTFCA_3_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_1_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_3_EXTRACT]], <8 x i8> [[C]]) +// CHECK-NEXT: ret <8 x i8> [[VTBX4_I]] +// +mfloat8x8_t test_vqtbx4_mf8(mfloat8x8_t a, mfloat8x16x4_t b, uint8x8_t c) { + return vqtbx4_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vqtbx4q_mf8( +// CHECK-SAME: <16 x i8> [[A:%.*]], [4 x <16 x i8>] alignstack(16) [[B_COERCE:%.*]], <16 x i8> [[C:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[B_COERCE_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 0 +// CHECK-NEXT: [[B_COERCE_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 1 +// CHECK-NEXT: [[B_COERCE_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 2 +// CHECK-NEXT: 
[[B_COERCE_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[B_COERCE]], 3 +// CHECK-NEXT: [[DOTFCA_0_INSERT:%.*]] = insertvalue [4 x <16 x i8>] poison, <16 x i8> [[B_COERCE_FCA_0_EXTRACT]], 0 +// CHECK-NEXT: [[DOTFCA_1_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_0_INSERT]], <16 x i8> [[B_COERCE_FCA_1_EXTRACT]], 1 +// CHECK-NEXT: [[DOTFCA_2_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_1_INSERT]], <16 x i8> [[B_COERCE_FCA_2_EXTRACT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT:%.*]] = insertvalue [4 x <16 x i8>] [[DOTFCA_2_INSERT]], <16 x i8> [[B_COERCE_FCA_3_EXTRACT]], 3 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_0_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 0 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_1_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 1 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_2_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 2 +// CHECK-NEXT: [[DOTFCA_3_INSERT_FCA_3_EXTRACT:%.*]] = extractvalue [4 x <16 x i8>] [[DOTFCA_3_INSERT]], 3 +// CHECK-NEXT: [[VTBX4_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.tbx4.v16i8(<16 x i8> [[A]], <16 x i8> [[DOTFCA_3_INSERT_FCA_0_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_1_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_2_EXTRACT]], <16 x i8> [[DOTFCA_3_INSERT_FCA_3_EXTRACT]], <16 x i8> [[C]]) +// CHECK-NEXT: ret <16 x i8> [[VTBX4_I]] +// +mfloat8x16_t test_vqtbx4q_mf8(mfloat8x16_t a, mfloat8x16x4_t b, mfloat8x16_t c) { + return vqtbx4q_mf8(a, b, c); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_lane_mf8( +// CHECK-SAME: <8 x i8> [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v8i8(<8 x i8> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +mfloat8x16_t test_vluti2_lane_mf8(mfloat8x8_t vn, uint8x8_t vm) { + return vluti2_lane_mf8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> 
@test_vluti2q_lane_mf8( +// CHECK-SAME: <16 x i8> [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLUTI2_LANE:%.*]] = call <16 x i8> @llvm.aarch64.neon.vluti2.lane.v16i8.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANE]] +// +mfloat8x16_t test_vluti2q_lane_mf8(mfloat8x16_t vn, uint8x8_t vm) { + return vluti2q_lane_mf8(vn, vm, 1); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2_laneq_mf8( +// CHECK-SAME: <8 x i8> [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v8i8(<8 x i8> [[VN]], <16 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]] +// +mfloat8x16_t test_vluti2_laneq_mf8(mfloat8x8_t vn, uint8x16_t vm) { + return vluti2_laneq_mf8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti2q_laneq_mf8( +// CHECK-SAME: <16 x i8> [[VN:%.*]], <16 x i8> noundef [[VM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLUTI2_LANEQ:%.*]] = call <16 x i8> @llvm.aarch64.neon.vluti2.laneq.v16i8.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <16 x i8> [[VLUTI2_LANEQ]] +// +mfloat8x16_t test_vluti2q_laneq_mf8(mfloat8x16_t vn, uint8x16_t vm) { + return vluti2q_laneq_mf8(vn, vm, 3); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_lane_mf8( +// CHECK-SAME: <16 x i8> [[VN:%.*]], <8 x i8> noundef [[VM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLUTI4Q_LANE:%.*]] = call <16 x i8> @llvm.aarch64.neon.vluti4q.lane.v16i8(<16 x i8> [[VN]], <8 x i8> [[VM]], i32 0) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANE]] +// +mfloat8x16_t test_vluti4q_lane_mf8(mfloat8x16_t vn, uint8x8_t vm) { + return vluti4q_lane_mf8(vn, vm, 0); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vluti4q_laneq_mf8( +// CHECK-SAME: <16 x i8> [[VN:%.*]], <16 x i8> 
noundef [[VM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[VLUTI4Q_LANEQ:%.*]] = call <16 x i8> @llvm.aarch64.neon.vluti4q.laneq.v16i8(<16 x i8> [[VN]], <16 x i8> [[VM]], i32 1) +// CHECK-NEXT: ret <16 x i8> [[VLUTI4Q_LANEQ]] +// +mfloat8x16_t test_vluti4q_laneq_mf8(mfloat8x16_t vn, uint8x16_t vm) { + return vluti4q_laneq_mf8(vn, vm, 1); +} From 2e53a8618b8e73a7a523ed12ba9286257b002c3b Mon Sep 17 00:00:00 2001 From: Marian Lukac Date: Tue, 29 Apr 2025 16:18:17 +0000 Subject: [PATCH 2/5] [NeonEmitter] Update ArchGuard for MFloat8 type to ensure correct architecture checks on AArch64 --- clang/include/clang/Basic/arm_neon.td | 162 ++++++++------------------ clang/utils/TableGen/NeonEmitter.cpp | 12 ++ 2 files changed, 60 insertions(+), 114 deletions(-) diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 90f0e90e4a7f8..7251cc2d1759a 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -279,10 +279,10 @@ def OP_CVT_F32_BF16 // Splat operation - performs a range-checked splat over a vector def SPLAT : WInst<"splat_lane", ".(!q)I", - "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl", + "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPlmQm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; def SPLATQ : WInst<"splat_laneq", ".(!Q)I", - "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPl", + "UcUsUicsilPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlhdQhQdPlQPlmQm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; let TargetGuard = "bf16,neon" in { @@ -547,19 +547,19 @@ def VST4_LANE_F16 : WInst<"vst4_lane", "v*(4!)I", "hQh", // E.3.16 Extract lanes from a vector let InstName = "vmov" in def VGET_LANE : IInst<"vget_lane", "1.I", - "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl", + "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlmQm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; //////////////////////////////////////////////////////////////////////////////// // E.3.17 Set lanes 
within a vector let InstName = "vmov" in def VSET_LANE : IInst<"vset_lane", ".1.I", - "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUl", + "UcUsUicsiPcPsfQUcQUsQUiQcQsQiQPcQPsQflUlQlQUlmQm", [ImmCheck<2, ImmCheckLaneIndex, 1>]>; //////////////////////////////////////////////////////////////////////////////// // E.3.18 Initialize a vector from bit pattern -def VCREATE : NoTestOpInst<"vcreate", ".(IU>)", "csihfUcUsUiUlPcPsl", OP_CAST> { +def VCREATE : NoTestOpInst<"vcreate", ".(IU>)", "csihfUcUsUiUlPcPslm", OP_CAST> { let BigEndianSafe = 1; } @@ -567,20 +567,20 @@ def VCREATE : NoTestOpInst<"vcreate", ".(IU>)", "csihfUcUsUiUlPcPsl", OP_CAST> { // E.3.19 Set all lanes to same value let InstName = "vmov" in { def VDUP_N : WOpInst<"vdup_n", ".1", - "UcUsUicsiPcPshfQUcQUsQUiQcQsQiQPcQPsQhQflUlQlQUl", + "UcUsUicsiPcPshfQUcQUsQUiQcQsQiQPcQPsQhQflUlQlQUlmQm", OP_DUP>; def VMOV_N : WOpInst<"vmov_n", ".1", - "UcUsUicsiPcPshfQUcQUsQUiQcQsQiQPcQPsQhQflUlQlQUl", + "UcUsUicsiPcPshfQUcQUsQUiQcQsQiQPcQPsQhQflUlQlQUlmQm", OP_DUP>; } let InstName = "" in def VDUP_LANE: WOpInst<"vdup_lane", ".qI", - "UcUsUicsiPcPshfQUcQUsQUiQcQsQiQPcQPsQhQflUlQlQUl", + "UcUsUicsiPcPshfQUcQUsQUiQcQsQiQPcQPsQhQflUlQlQUlmQm", OP_DUP_LN>; //////////////////////////////////////////////////////////////////////////////// // E.3.20 Combining vectors -def VCOMBINE : NoTestOpInst<"vcombine", "Q..", "csilhfUcUsUiUlPcPs", OP_CONC>; +def VCOMBINE : NoTestOpInst<"vcombine", "Q..", "csilhfUcUsUiUlPcPsm", OP_CONC>; //////////////////////////////////////////////////////////////////////////////// // E.3.21 Splitting vectors @@ -589,8 +589,8 @@ def VCOMBINE : NoTestOpInst<"vcombine", "Q..", "csilhfUcUsUiUlPcPs", OP_CONC>; // versions of these intrinsics in both AArch32 and AArch64 architectures. See // D45668 for more details. 
let InstName = "vmov" in { -def VGET_HIGH : NoTestOpInst<"vget_high", ".Q", "csilhfUcUsUiUlPcPs", OP_HI>; -def VGET_LOW : NoTestOpInst<"vget_low", ".Q", "csilhfUcUsUiUlPcPs", OP_LO>; +def VGET_HIGH : NoTestOpInst<"vget_high", ".Q", "csilhfUcUsUiUlPcPsm", OP_HI>; +def VGET_LOW : NoTestOpInst<"vget_low", ".Q", "csilhfUcUsUiUlPcPsm", OP_LO>; } //////////////////////////////////////////////////////////////////////////////// @@ -619,16 +619,16 @@ def VQMOVUN : SInst<"vqmovun", "(; //////////////////////////////////////////////////////////////////////////////// // E.3.23-24 Table lookup, Extended table lookup let InstName = "vtbl" in { -def VTBL1 : WInst<"vtbl1", "..p", "UccPc">; -def VTBL2 : WInst<"vtbl2", ".2p", "UccPc">; -def VTBL3 : WInst<"vtbl3", ".3p", "UccPc">; -def VTBL4 : WInst<"vtbl4", ".4p", "UccPc">; +def VTBL1 : WInst<"vtbl1", "..p", "UccPcm">; +def VTBL2 : WInst<"vtbl2", ".2p", "UccPcm">; +def VTBL3 : WInst<"vtbl3", ".3p", "UccPcm">; +def VTBL4 : WInst<"vtbl4", ".4p", "UccPcm">; } let InstName = "vtbx" in { -def VTBX1 : WInst<"vtbx1", "...p", "UccPc">; -def VTBX2 : WInst<"vtbx2", "..2p", "UccPc">; -def VTBX3 : WInst<"vtbx3", "..3p", "UccPc">; -def VTBX4 : WInst<"vtbx4", "..4p", "UccPc">; +def VTBX1 : WInst<"vtbx1", "...p", "UccPcm">; +def VTBX2 : WInst<"vtbx2", "..2p", "UccPcm">; +def VTBX3 : WInst<"vtbx3", "..3p", "UccPcm">; +def VTBX4 : WInst<"vtbx4", "..4p", "UccPcm">; } //////////////////////////////////////////////////////////////////////////////// @@ -677,15 +677,15 @@ def VQDMLSL_N : SOpInst<"vqdmlsl_n", "(>Q)(>Q).1", "si", OP_QDMLSL_N>; //////////////////////////////////////////////////////////////////////////////// // E.3.26 Vector Extract def VEXT : WInst<"vext", "...I", - "cUcPcsUsPsiUilUlfQcQUcQPcQsQUsQPsQiQUiQlQUlQf", + "cUcPcsUsPsiUilUlfQcQUcQPcQsQUsQPsQiQUiQlQUlQfmQm", [ImmCheck<2, ImmCheckLaneIndex, 0>]>; //////////////////////////////////////////////////////////////////////////////// // E.3.27 Reverse vector elements -def VREV64 : 
WOpInst<"vrev64", "..", "csiUcUsUiPcPsfQcQsQiQUcQUsQUiQPcQPsQf", +def VREV64 : WOpInst<"vrev64", "..", "csiUcUsUiPcPsfQcQsQiQUcQUsQUiQPcQPsQfmQm", OP_REV64>; -def VREV32 : WOpInst<"vrev32", "..", "csUcUsPcPsQcQsQUcQUsQPcQPs", OP_REV32>; -def VREV16 : WOpInst<"vrev16", "..", "cUcPcQcQUcQPc", OP_REV16>; +def VREV32 : WOpInst<"vrev32", "..", "csUcUsPcPsQcQsQUcQUsQPcQPsmQm", OP_REV32>; +def VREV16 : WOpInst<"vrev16", "..", "cUcPcQcQUcQPcmQm", OP_REV16>; //////////////////////////////////////////////////////////////////////////////// // E.3.28 Other single operand arithmetic @@ -709,13 +709,13 @@ def VBIC : LOpInst<"vbic", "...", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_ANDN>; def VORN : LOpInst<"vorn", "...", "csilUcUsUiUlQcQsQiQlQUcQUsQUiQUl", OP_ORN>; let isHiddenLInst = 1 in def VBSL : SInst<"vbsl", ".U..", - "csilUcUsUiUlfPcPsQcQsQiQlQUcQUsQUiQUlQfQPcQPs">; + "csilUcUsUiUlfPcPsQcQsQiQlQUcQUsQUiQUlQfQPcQPsmQm">; //////////////////////////////////////////////////////////////////////////////// // E.3.30 Transposition operations -def VTRN : WInst<"vtrn", "2..", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPs">; -def VZIP : WInst<"vzip", "2..", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPs">; -def VUZP : WInst<"vuzp", "2..", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPs">; +def VTRN : WInst<"vtrn", "2..", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPsmQm">; +def VZIP : WInst<"vzip", "2..", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPsmQm">; +def VUZP : WInst<"vuzp", "2..", "csiUcUsUifPcPsQcQsQiQUcQUsQUiQfQPcQPsmQm">; //////////////////////////////////////////////////////////////////////////////// @@ -1028,19 +1028,19 @@ def GET_LANE : IInst<"vget_lane", "1.I", "dQdPlQPl", def SET_LANE : IInst<"vset_lane", ".1.I", "dQdPlQPl", [ImmCheck<2, ImmCheckLaneIndex, 1>]>; def COPY_LANE : IOpInst<"vcopy_lane", "..I.I", - "csilUcUsUiUlPcPsPlfd", OP_COPY_LN>; + "csilUcUsUiUlPcPsPlfdm", OP_COPY_LN>; def COPYQ_LANE : IOpInst<"vcopy_lane", "..IqI", - "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN>; + 
"QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPlQm", OP_COPY_LN>; def COPY_LANEQ : IOpInst<"vcopy_laneq", "..IQI", - "csilPcPsPlUcUsUiUlfd", OP_COPY_LN>; + "csilPcPsPlUcUsUiUlfdm", OP_COPY_LN>; def COPYQ_LANEQ : IOpInst<"vcopy_laneq", "..I.I", - "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPl", OP_COPY_LN>; + "QcQsQiQlQUcQUsQUiQUlQPcQPsQfQdQPlQm", OP_COPY_LN>; //////////////////////////////////////////////////////////////////////////////// // Set all lanes to same value def VDUP_LANE1: WOpInst<"vdup_lane", ".qI", "dQdPlQPl", OP_DUP_LN>; def VDUP_LANE2: WOpInst<"vdup_laneq", ".QI", - "csilUcUsUiUlPcPshfdQcQsQiQlQPcQPsQUcQUsQUiQUlQhQfQdPlQPl", + "csilUcUsUiUlPcPshfdQcQsQiQlQPcQPsQUcQUsQUiQUlQhQfQdPlQPlmQm", OP_DUP_LN>; def DUP_N : WOpInst<"vdup_n", ".1", "dQdPlQPl", OP_DUP>; def MOV_N : WOpInst<"vmov_n", ".1", "dQdPlQPl", OP_DUP>; @@ -1266,31 +1266,31 @@ def FMINNM_S64 : SInst<"vminnm", "...", "dQd">; //////////////////////////////////////////////////////////////////////////////// // Permutation def VTRN1 : SOpInst<"vtrn1", "...", - "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPl", OP_TRN1>; + "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPlmQm", OP_TRN1>; def VZIP1 : SOpInst<"vzip1", "...", - "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPl", OP_ZIP1>; + "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPlmQm", OP_ZIP1>; def VUZP1 : SOpInst<"vuzp1", "...", - "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPl", OP_UZP1>; + "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPlmQm", OP_UZP1>; def VTRN2 : SOpInst<"vtrn2", "...", - "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPl", OP_TRN2>; + "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPlmQm", OP_TRN2>; def VZIP2 : SOpInst<"vzip2", "...", - "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPl", OP_ZIP2>; + "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPlmQm", OP_ZIP2>; def VUZP2 : SOpInst<"vuzp2", "...", - "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPl", OP_UZP2>; + "csiUcUsUifPcPsQcQsQiQlQUcQUsQUiQUlQfQdQPcQPsQPlmQm", OP_UZP2>; 
//////////////////////////////////////////////////////////////////////////////// // Table lookup let InstName = "vtbl" in { -def VQTBL1_A64 : WInst<"vqtbl1", ".QU", "UccPcQUcQcQPc">; -def VQTBL2_A64 : WInst<"vqtbl2", ".(2Q)U", "UccPcQUcQcQPc">; -def VQTBL3_A64 : WInst<"vqtbl3", ".(3Q)U", "UccPcQUcQcQPc">; -def VQTBL4_A64 : WInst<"vqtbl4", ".(4Q)U", "UccPcQUcQcQPc">; +def VQTBL1_A64 : WInst<"vqtbl1", ".QU", "UccPcQUcQcQPcmQm">; +def VQTBL2_A64 : WInst<"vqtbl2", ".(2Q)U", "UccPcQUcQcQPcmQm">; +def VQTBL3_A64 : WInst<"vqtbl3", ".(3Q)U", "UccPcQUcQcQPcmQm">; +def VQTBL4_A64 : WInst<"vqtbl4", ".(4Q)U", "UccPcQUcQcQPcmQm">; } let InstName = "vtbx" in { -def VQTBX1_A64 : WInst<"vqtbx1", "..QU", "UccPcQUcQcQPc">; -def VQTBX2_A64 : WInst<"vqtbx2", "..(2Q)U", "UccPcQUcQcQPc">; -def VQTBX3_A64 : WInst<"vqtbx3", "..(3Q)U", "UccPcQUcQcQPc">; -def VQTBX4_A64 : WInst<"vqtbx4", "..(4Q)U", "UccPcQUcQcQPc">; +def VQTBX1_A64 : WInst<"vqtbx1", "..QU", "UccPcQUcQcQPcmQm">; +def VQTBX2_A64 : WInst<"vqtbx2", "..(2Q)U", "UccPcQUcQcQPcmQm">; +def VQTBX3_A64 : WInst<"vqtbx3", "..(3Q)U", "UccPcQUcQcQPcmQm">; +def VQTBX4_A64 : WInst<"vqtbx4", "..(4Q)U", "UccPcQUcQcQPcmQm">; } //////////////////////////////////////////////////////////////////////////////// @@ -1654,9 +1654,9 @@ def SCALAR_SQRDMLSH_LANE : SOpInst<"vqrdmlsh_lane", "111.I", "SsSi", OP_SCALAR_Q def SCALAR_SQRDMLSH_LANEQ : SOpInst<"vqrdmlsh_laneq", "111QI", "SsSi", OP_SCALAR_QRDMLSH_LN>; } // TargetGuard = "v8.1a" -def SCALAR_VDUP_LANE : IInst<"vdup_lane", "1.I", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs", +def SCALAR_VDUP_LANE : IInst<"vdup_lane", "1.I", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPsSm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; -def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPs", +def SCALAR_VDUP_LANEQ : IInst<"vdup_laneq", "1QI", "ScSsSiSlSfSdSUcSUsSUiSUlSPcSPsSm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; } // ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)" @@ -2194,70 +2194,4 @@ let ArchGuard = 
"defined(__aarch64__)", TargetGuard = "fp8,neon" in { // fscale def FSCALE_V128 : WInst<"vscale", "..(.S)", "QdQfQh">; def FSCALE_V64 : WInst<"vscale", "(.q)(.q)(.qS)", "fh">; -} - -//FP8 versions of untyped intrinsics -let ArchGuard = "defined(__aarch64__)" in { - def VGET_LANE_MF8 : IInst<"vget_lane", "1.I", "mQm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; - def SPLAT_MF8 : WInst<"splat_lane", ".(!q)I", "mQm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; - def SPLATQ_MF8 : WInst<"splat_laneq", ".(!Q)I", "mQm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; - def VSET_LANE_MF8 : IInst<"vset_lane", ".1.I", "mQm", [ImmCheck<2, ImmCheckLaneIndex, 1>]>; - def VCREATE_MF8 : NoTestOpInst<"vcreate", ".(IU>)", "m", OP_CAST> { let BigEndianSafe = 1; } - let InstName = "vmov" in { - def VDUP_N_MF8 : WOpInst<"vdup_n", ".1", "mQm", OP_DUP>; - def VMOV_N_MF8 : WOpInst<"vmov_n", ".1", "mQm", OP_DUP>; - } - let InstName = "" in - def VDUP_LANE_MF8: WOpInst<"vdup_lane", ".qI", "mQm", OP_DUP_LN>; - def VCOMBINE_MF8 : NoTestOpInst<"vcombine", "Q..", "m", OP_CONC>; - let InstName = "vmov" in { - def VGET_HIGH_MF8 : NoTestOpInst<"vget_high", ".Q", "m", OP_HI>; - def VGET_LOW_MF8 : NoTestOpInst<"vget_low", ".Q", "m", OP_LO>; - } - let InstName = "vtbl" in { - def VTBL1_MF8 : WInst<"vtbl1", "..p", "m">; - def VTBL2_MF8 : WInst<"vtbl2", ".2p", "m">; - def VTBL3_MF8 : WInst<"vtbl3", ".3p", "m">; - def VTBL4_MF8 : WInst<"vtbl4", ".4p", "m">; - } - let InstName = "vtbx" in { - def VTBX1_MF8 : WInst<"vtbx1", "...p", "m">; - def VTBX2_MF8 : WInst<"vtbx2", "..2p", "m">; - def VTBX3_MF8 : WInst<"vtbx3", "..3p", "m">; - def VTBX4_MF8 : WInst<"vtbx4", "..4p", "m">; - } - def VEXT_MF8 : WInst<"vext", "...I", "mQm", [ImmCheck<2, ImmCheckLaneIndex, 0>]>; - def VREV64_MF8 : WOpInst<"vrev64", "..", "mQm", OP_REV64>; - def VREV32_MF8 : WOpInst<"vrev32", "..", "mQm", OP_REV32>; - def VREV16_MF8 : WOpInst<"vrev16", "..", "mQm", OP_REV16>; - let isHiddenLInst = 1 in - def VBSL_MF8 : SInst<"vbsl", ".U..", "mQm">; - def 
VTRN_MF8 : WInst<"vtrn", "2..", "mQm">; - def VZIP_MF8 : WInst<"vzip", "2..", "mQm">; - def VUZP_MF8 : WInst<"vuzp", "2..", "mQm">; - def COPY_LANE_MF8 : IOpInst<"vcopy_lane", "..I.I", "m", OP_COPY_LN>; - def COPYQ_LANE_MF8 : IOpInst<"vcopy_lane", "..IqI", "Qm", OP_COPY_LN>; - def COPY_LANEQ_MF8 : IOpInst<"vcopy_laneq", "..IQI", "m", OP_COPY_LN>; - def COPYQ_LANEQ_MF8 : IOpInst<"vcopy_laneq", "..I.I", "Qm", OP_COPY_LN>; - def VDUP_LANE2_MF8 : WOpInst<"vdup_laneq", ".QI", "mQm", OP_DUP_LN>; - def VTRN1_MF8 : SOpInst<"vtrn1", "...", "mQm", OP_TRN1>; - def VZIP1_MF8 : SOpInst<"vzip1", "...", "mQm", OP_ZIP1>; - def VUZP1_MF8 : SOpInst<"vuzp1", "...", "mQm", OP_UZP1>; - def VTRN2_MF8 : SOpInst<"vtrn2", "...", "mQm", OP_TRN2>; - def VZIP2_MF8 : SOpInst<"vzip2", "...", "mQm", OP_ZIP2>; - def VUZP2_MF8 : SOpInst<"vuzp2", "...", "mQm", OP_UZP2>; - let InstName = "vtbl" in { - def VQTBL1_A64_MF8 : WInst<"vqtbl1", ".QU", "mQm">; - def VQTBL2_A64_MF8 : WInst<"vqtbl2", ".(2Q)U", "mQm">; - def VQTBL3_A64_MF8 : WInst<"vqtbl3", ".(3Q)U", "mQm">; - def VQTBL4_A64_MF8 : WInst<"vqtbl4", ".(4Q)U", "mQm">; - } - let InstName = "vtbx" in { - def VQTBX1_A64_MF8 : WInst<"vqtbx1", "..QU", "mQm">; - def VQTBX2_A64_MF8 : WInst<"vqtbx2", "..(2Q)U", "mQm">; - def VQTBX3_A64_MF8 : WInst<"vqtbx3", "..(3Q)U", "mQm">; - def VQTBX4_A64_MF8 : WInst<"vqtbx4", "..(4Q)U", "mQm">; - } - def SCALAR_VDUP_LANE_MF8 : IInst<"vdup_lane", "1.I", "Sm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; - def SCALAR_VDUP_LANEQ_MF8 : IInst<"vdup_laneq", "1QI", "Sm", [ImmCheck<1, ImmCheckLaneIndex, 0>]>; } \ No newline at end of file diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index 6b0fa1648e583..c3dbef385e778 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -2056,9 +2056,21 @@ void NeonEmitter::createIntrinsic(const Record *R, auto &Entry = IntrinsicMap[Name]; for (auto &I : NewTypeSpecs) { + + // MFloat8 type is only available on 
AArch64. If encountered, set ArchGuard + // correctly. + std::string savedArchGuard = ArchGuard; + if (Type(I.first, ".").isMFloat8()) { + if (ArchGuard.empty()) { + ArchGuard = "defined(__aarch64__)"; + } else if (ArchGuard.find("defined(__aarch64__)") == std::string::npos) { + ArchGuard = "defined(__aarch64__) && (" + savedArchGuard + ")"; + } + } Entry.emplace_back(R, Name, Proto, I.first, I.second, CK, Body, *this, ArchGuard, TargetGuard, IsUnavailable, BigEndianSafe); Out.push_back(&Entry.back()); + ArchGuard = savedArchGuard; } CurrentRecord = nullptr; From 842f1971e54cdbd79b10aa5127ba2fd0f8c1b8a4 Mon Sep 17 00:00:00 2001 From: Marian Lukac Date: Tue, 6 May 2025 13:41:34 +0000 Subject: [PATCH 3/5] Change format of Mfloat8 type in memory back to <1xi8> Change-Id: I6c4d9d98fbe46fb3ee115532a9432709c6a86e10 --- clang/lib/CodeGen/CGCall.cpp | 9 - clang/lib/CodeGen/CodeGenTypes.cpp | 3 - clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 31 ++- clang/test/CodeGen/AArch64/fp8-init-list.c | 14 +- .../fp8-intrinsics/acle_neon_fp8_untyped.c | 180 +++++++++++------- .../fp8-intrinsics/acle_sve2_fp8_fdot.c | 8 +- .../fp8-intrinsics/acle_sve2_fp8_fmla.c | 24 +-- clang/test/CodeGen/arm-mfp8.c | 44 ++--- clang/utils/TableGen/NeonEmitter.cpp | 14 +- 9 files changed, 184 insertions(+), 143 deletions(-) diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index e5b2230566034..c7fbbbc6fd40d 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -5484,15 +5484,6 @@ RValue CodeGenFunction::EmitCall(const CGFunctionInfo &CallInfo, Builder.CreateStore(errorValue, swiftErrorTemp); } - // Mfloat8 type is loaded as scalar type, but is treated as single - // vector type for other operations. We need to bitcast it to the vector - // type here.
- if (auto *EltTy = dyn_cast(ArgInfo.getCoerceToType()); - EltTy && EltTy->getNumElements() == 1 && - V->getType() == EltTy->getScalarType()) - V = Builder.CreateBitCast(V, EltTy); - // We might have to widen integers, but we should never truncate. if (ArgInfo.getCoerceToType() != V->getType() && V->getType()->isIntegerTy()) diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index d1b292f23c2d2..843733ba6604d 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -108,9 +108,6 @@ llvm::Type *CodeGenTypes::ConvertTypeForMem(QualType T) { MT->getNumRows() * MT->getNumColumns()); } - if (T->isMFloat8Type()) - return llvm::Type::getInt8Ty(getLLVMContext()); - llvm::Type *R = ConvertType(T); // Check for the boolean vector case. diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index c0454e7bcc661..a38436611a1ed 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -4183,9 +4183,21 @@ Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E, unsigned IntrinsicID, bool IsZExtReturn) { QualType LangPTy = E->getArg(1)->getType(); - llvm::Type *MemEltTy = CGM.getTypes().ConvertTypeForMem( + llvm::Type *MemEltTy = CGM.getTypes().ConvertType( LangPTy->castAs()->getPointeeType()); + // Mfloat8 type is stored as a vector, so extra work + // to extract the scalar element type is necessary. + if (MemEltTy->isVectorTy()) { + #ifndef NDEBUG + auto *VecTy = cast(MemEltTy); + ElementCount EC = VecTy->getElementCount(); + assert(EC.isScalar() && VecTy->getElementType() == Int8Ty && + "Only <1 x i8> expected"); + #endif + MemEltTy = cast(MemEltTy)->getElementType(); + } + // The vector type that is returned may be different from the // eventual type loaded from memory.
auto VectorTy = cast(ReturnTy); @@ -4230,9 +4242,21 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E, SmallVectorImpl &Ops, unsigned IntrinsicID) { QualType LangPTy = E->getArg(1)->getType(); - llvm::Type *MemEltTy = CGM.getTypes().ConvertTypeForMem( + llvm::Type *MemEltTy = CGM.getTypes().ConvertType( LangPTy->castAs()->getPointeeType()); + // Mfloat8 type is stored as a vector, so extra work + // to extract scalar element type is necessary. + if (MemEltTy->isVectorTy()) { + #ifndef NDEBUG + auto *VecTy = cast(MemEltTy); + ElementCount EC = VecTy->getElementCount(); + assert(EC.isScalar() && VecTy->getElementType() == Int8Ty && + "Only <1 x i8> expected"); + #endif + MemEltTy = cast(MemEltTy)->getElementType(); + } + // The vector type that is stored may be different from the // eventual type stored to memory. auto VectorTy = cast(Ops.back()->getType()); @@ -6169,6 +6193,9 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vset_lane_mf8: case NEON::BI__builtin_neon_vsetq_lane_mf8: Ops.push_back(EmitScalarExpr(E->getArg(2))); + // The input vector type needs a cast to scalar type. + Ops[0] = + Builder.CreateBitCast(Ops[0], llvm::Type::getInt8Ty(getLLVMContext())); return Builder.CreateInsertElement(Ops[1], Ops[0], Ops[2], "vset_lane"); case NEON::BI__builtin_neon_vsetq_lane_f64: // The vector type needs a cast for the v2f64 variant. 
diff --git a/clang/test/CodeGen/AArch64/fp8-init-list.c b/clang/test/CodeGen/AArch64/fp8-init-list.c index 872ee4f8a3d42..8b4b31a71c46a 100644 --- a/clang/test/CodeGen/AArch64/fp8-init-list.c +++ b/clang/test/CodeGen/AArch64/fp8-init-list.c @@ -12,14 +12,14 @@ // CHECK-LABEL: define dso_local <8 x i8> @vector_init_test( // CHECK-SAME: <1 x i8> [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[VECINIT7:%.*]] = shufflevector <1 x i8> [[X]], <1 x i8> poison, <8 x i32> zeroinitializer -// CHECK-NEXT: ret <8 x i8> [[VECINIT7]] +// CHECK-NEXT: [[VECINIT14:%.*]] = shufflevector <1 x i8> [[X]], <1 x i8> poison, <8 x i32> zeroinitializer +// CHECK-NEXT: ret <8 x i8> [[VECINIT14]] // // CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z16vector_init_testu6__mfp8( // CHECK-CXX-SAME: <1 x i8> [[X:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[VECINIT7:%.*]] = shufflevector <1 x i8> [[X]], <1 x i8> poison, <8 x i32> zeroinitializer -// CHECK-CXX-NEXT: ret <8 x i8> [[VECINIT7]] +// CHECK-CXX-NEXT: [[VECINIT14:%.*]] = shufflevector <1 x i8> [[X]], <1 x i8> poison, <8 x i32> zeroinitializer +// CHECK-CXX-NEXT: ret <8 x i8> [[VECINIT14]] // mfloat8x8_t vector_init_test(__mfp8 x) { return (mfloat8x8_t) {x, x, x, x, x, x, x, x}; @@ -34,15 +34,13 @@ struct S s; // CHECK-LABEL: define dso_local void @f( // CHECK-SAME: <1 x i8> [[X:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[X]], i64 0 -// CHECK-NEXT: store i8 [[TMP0]], ptr @s, align 1, !tbaa [[TBAA2:![0-9]+]] +// CHECK-NEXT: store <1 x i8> [[X]], ptr @s, align 1, !tbaa [[TBAA2:![0-9]+]] // CHECK-NEXT: ret void // // CHECK-CXX-LABEL: define dso_local void @_Z1fu6__mfp8( // CHECK-CXX-SAME: <1 x i8> [[X:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[X]], i64 0 -// 
CHECK-CXX-NEXT: store i8 [[TMP0]], ptr @s, align 1, !tbaa [[TBAA2:![0-9]+]] +// CHECK-CXX-NEXT: store <1 x i8> [[X]], ptr @s, align 1, !tbaa [[TBAA2:![0-9]+]] // CHECK-CXX-NEXT: ret void // void f(__mfp8 x) { diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c index fec1a93bdd5e9..fdc861836baf7 100644 --- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_untyped.c @@ -86,18 +86,23 @@ mfloat8x8_t test_vcreate_mf8(uint64_t a) { // CHECK-LABEL: define dso_local <8 x i8> @test_vdup_n_mf8( // CHECK-SAME: <1 x i8> [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[A]] to i8 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[TMP0]] to <1 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i8> [[TMP1]] to i8 -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[TMP2]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[TMP2]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[TMP2]], i32 3 -// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[TMP2]], i32 4 -// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[TMP2]], i32 5 -// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[TMP2]], i32 6 -// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[TMP2]], i32 7 -// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]] +// CHECK-NEXT: [[VEXT_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT_I:%.*]] = shufflevector <8 x i8> [[VEXT_I]], <8 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VEXT1_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x 
i32> +// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> [[VEXT1_I]], <8 x i32> +// CHECK-NEXT: [[VEXT3_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT4_I:%.*]] = shufflevector <8 x i8> [[VECINIT2_I]], <8 x i8> [[VEXT3_I]], <8 x i32> +// CHECK-NEXT: [[VEXT5_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT6_I:%.*]] = shufflevector <8 x i8> [[VECINIT4_I]], <8 x i8> [[VEXT5_I]], <8 x i32> +// CHECK-NEXT: [[VEXT7_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT8_I:%.*]] = shufflevector <8 x i8> [[VECINIT6_I]], <8 x i8> [[VEXT7_I]], <8 x i32> +// CHECK-NEXT: [[VEXT9_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT10_I:%.*]] = shufflevector <8 x i8> [[VECINIT8_I]], <8 x i8> [[VEXT9_I]], <8 x i32> +// CHECK-NEXT: [[VEXT11_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT12_I:%.*]] = shufflevector <8 x i8> [[VECINIT10_I]], <8 x i8> [[VEXT11_I]], <8 x i32> +// CHECK-NEXT: [[VEXT13_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT14_I:%.*]] = shufflevector <8 x i8> [[VECINIT12_I]], <8 x i8> [[VEXT13_I]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[VECINIT14_I]] // mfloat8x8_t test_vdup_n_mf8(mfloat8_t a) { return vdup_n_mf8(a); @@ -106,26 +111,39 @@ mfloat8x8_t test_vdup_n_mf8(mfloat8_t a) { // CHECK-LABEL: define dso_local <16 x i8> @test_vdupq_n_mf8( // CHECK-SAME: <1 x i8> [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[A]] to i8 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[TMP0]] to <1 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i8> [[TMP1]] to i8 -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[TMP2]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[TMP2]], i32 1 
-// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[TMP2]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[TMP2]], i32 3 -// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[TMP2]], i32 4 -// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[TMP2]], i32 5 -// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[TMP2]], i32 6 -// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[TMP2]], i32 7 -// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[TMP2]], i32 8 -// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[TMP2]], i32 9 -// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[TMP2]], i32 10 -// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[TMP2]], i32 11 -// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[TMP2]], i32 12 -// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[TMP2]], i32 13 -// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[TMP2]], i32 14 -// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[TMP2]], i32 15 -// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]] +// CHECK-NEXT: [[VEXT_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT_I:%.*]] = shufflevector <16 x i8> [[VEXT_I]], <16 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VEXT1_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> [[VEXT1_I]], <16 x i32> +// CHECK-NEXT: [[VEXT3_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT4_I:%.*]] = shufflevector <16 x i8> [[VECINIT2_I]], <16 x i8> [[VEXT3_I]], <16 x 
i32> +// CHECK-NEXT: [[VEXT5_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT6_I:%.*]] = shufflevector <16 x i8> [[VECINIT4_I]], <16 x i8> [[VEXT5_I]], <16 x i32> +// CHECK-NEXT: [[VEXT7_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT8_I:%.*]] = shufflevector <16 x i8> [[VECINIT6_I]], <16 x i8> [[VEXT7_I]], <16 x i32> +// CHECK-NEXT: [[VEXT9_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT10_I:%.*]] = shufflevector <16 x i8> [[VECINIT8_I]], <16 x i8> [[VEXT9_I]], <16 x i32> +// CHECK-NEXT: [[VEXT11_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT12_I:%.*]] = shufflevector <16 x i8> [[VECINIT10_I]], <16 x i8> [[VEXT11_I]], <16 x i32> +// CHECK-NEXT: [[VEXT13_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT14_I:%.*]] = shufflevector <16 x i8> [[VECINIT12_I]], <16 x i8> [[VEXT13_I]], <16 x i32> +// CHECK-NEXT: [[VEXT15_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT16_I:%.*]] = shufflevector <16 x i8> [[VECINIT14_I]], <16 x i8> [[VEXT15_I]], <16 x i32> +// CHECK-NEXT: [[VEXT17_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT18_I:%.*]] = shufflevector <16 x i8> [[VECINIT16_I]], <16 x i8> [[VEXT17_I]], <16 x i32> +// CHECK-NEXT: [[VEXT19_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT20_I:%.*]] = shufflevector <16 x i8> [[VECINIT18_I]], <16 x i8> [[VEXT19_I]], <16 x i32> +// CHECK-NEXT: [[VEXT21_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT22_I:%.*]] = shufflevector <16 x i8> [[VECINIT20_I]], <16 x i8> [[VEXT21_I]], <16 x i32> +// CHECK-NEXT: [[VEXT23_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT24_I:%.*]] = 
shufflevector <16 x i8> [[VECINIT22_I]], <16 x i8> [[VEXT23_I]], <16 x i32> +// CHECK-NEXT: [[VEXT25_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT26_I:%.*]] = shufflevector <16 x i8> [[VECINIT24_I]], <16 x i8> [[VEXT25_I]], <16 x i32> +// CHECK-NEXT: [[VEXT27_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT28_I:%.*]] = shufflevector <16 x i8> [[VECINIT26_I]], <16 x i8> [[VEXT27_I]], <16 x i32> +// CHECK-NEXT: [[VEXT29_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT30_I:%.*]] = shufflevector <16 x i8> [[VECINIT28_I]], <16 x i8> [[VEXT29_I]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[VECINIT30_I]] // mfloat8x16_t test_vdupq_n_mf8(mfloat8_t a) { return vdupq_n_mf8(a); @@ -134,18 +152,23 @@ mfloat8x16_t test_vdupq_n_mf8(mfloat8_t a) { // CHECK-LABEL: define dso_local <8 x i8> @test_vmov_n_mf8( // CHECK-SAME: <1 x i8> [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[A]] to i8 -// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8 [[TMP0]] to <1 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i8> [[TMP1]] to i8 -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <8 x i8> poison, i8 [[TMP2]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <8 x i8> [[VECINIT_I]], i8 [[TMP2]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <8 x i8> [[VECINIT1_I]], i8 [[TMP2]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <8 x i8> [[VECINIT2_I]], i8 [[TMP2]], i32 3 -// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <8 x i8> [[VECINIT3_I]], i8 [[TMP2]], i32 4 -// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <8 x i8> [[VECINIT4_I]], i8 [[TMP2]], i32 5 -// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <8 x i8> [[VECINIT5_I]], i8 [[TMP2]], i32 6 -// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <8 x i8> [[VECINIT6_I]], i8 [[TMP2]], i32 7 -// CHECK-NEXT: ret <8 x i8> [[VECINIT7_I]] 
+// CHECK-NEXT: [[VEXT_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT_I:%.*]] = shufflevector <8 x i8> [[VEXT_I]], <8 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VEXT1_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT2_I:%.*]] = shufflevector <8 x i8> [[VECINIT_I]], <8 x i8> [[VEXT1_I]], <8 x i32> +// CHECK-NEXT: [[VEXT3_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT4_I:%.*]] = shufflevector <8 x i8> [[VECINIT2_I]], <8 x i8> [[VEXT3_I]], <8 x i32> +// CHECK-NEXT: [[VEXT5_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT6_I:%.*]] = shufflevector <8 x i8> [[VECINIT4_I]], <8 x i8> [[VEXT5_I]], <8 x i32> +// CHECK-NEXT: [[VEXT7_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT8_I:%.*]] = shufflevector <8 x i8> [[VECINIT6_I]], <8 x i8> [[VEXT7_I]], <8 x i32> +// CHECK-NEXT: [[VEXT9_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT10_I:%.*]] = shufflevector <8 x i8> [[VECINIT8_I]], <8 x i8> [[VEXT9_I]], <8 x i32> +// CHECK-NEXT: [[VEXT11_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT12_I:%.*]] = shufflevector <8 x i8> [[VECINIT10_I]], <8 x i8> [[VEXT11_I]], <8 x i32> +// CHECK-NEXT: [[VEXT13_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <8 x i32> +// CHECK-NEXT: [[VECINIT14_I:%.*]] = shufflevector <8 x i8> [[VECINIT12_I]], <8 x i8> [[VEXT13_I]], <8 x i32> +// CHECK-NEXT: ret <8 x i8> [[VECINIT14_I]] // mfloat8x8_t test_vmov_n_mf8(mfloat8_t a) { return vmov_n_mf8(a); @@ -154,26 +177,39 @@ mfloat8x8_t test_vmov_n_mf8(mfloat8_t a) { // CHECK-LABEL: define dso_local <16 x i8> @test_vmovq_n_mf8( // CHECK-SAME: <1 x i8> [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[A]] to i8 -// CHECK-NEXT: [[TMP1:%.*]] = 
bitcast i8 [[TMP0]] to <1 x i8> -// CHECK-NEXT: [[TMP2:%.*]] = bitcast <1 x i8> [[TMP1]] to i8 -// CHECK-NEXT: [[VECINIT_I:%.*]] = insertelement <16 x i8> poison, i8 [[TMP2]], i32 0 -// CHECK-NEXT: [[VECINIT1_I:%.*]] = insertelement <16 x i8> [[VECINIT_I]], i8 [[TMP2]], i32 1 -// CHECK-NEXT: [[VECINIT2_I:%.*]] = insertelement <16 x i8> [[VECINIT1_I]], i8 [[TMP2]], i32 2 -// CHECK-NEXT: [[VECINIT3_I:%.*]] = insertelement <16 x i8> [[VECINIT2_I]], i8 [[TMP2]], i32 3 -// CHECK-NEXT: [[VECINIT4_I:%.*]] = insertelement <16 x i8> [[VECINIT3_I]], i8 [[TMP2]], i32 4 -// CHECK-NEXT: [[VECINIT5_I:%.*]] = insertelement <16 x i8> [[VECINIT4_I]], i8 [[TMP2]], i32 5 -// CHECK-NEXT: [[VECINIT6_I:%.*]] = insertelement <16 x i8> [[VECINIT5_I]], i8 [[TMP2]], i32 6 -// CHECK-NEXT: [[VECINIT7_I:%.*]] = insertelement <16 x i8> [[VECINIT6_I]], i8 [[TMP2]], i32 7 -// CHECK-NEXT: [[VECINIT8_I:%.*]] = insertelement <16 x i8> [[VECINIT7_I]], i8 [[TMP2]], i32 8 -// CHECK-NEXT: [[VECINIT9_I:%.*]] = insertelement <16 x i8> [[VECINIT8_I]], i8 [[TMP2]], i32 9 -// CHECK-NEXT: [[VECINIT10_I:%.*]] = insertelement <16 x i8> [[VECINIT9_I]], i8 [[TMP2]], i32 10 -// CHECK-NEXT: [[VECINIT11_I:%.*]] = insertelement <16 x i8> [[VECINIT10_I]], i8 [[TMP2]], i32 11 -// CHECK-NEXT: [[VECINIT12_I:%.*]] = insertelement <16 x i8> [[VECINIT11_I]], i8 [[TMP2]], i32 12 -// CHECK-NEXT: [[VECINIT13_I:%.*]] = insertelement <16 x i8> [[VECINIT12_I]], i8 [[TMP2]], i32 13 -// CHECK-NEXT: [[VECINIT14_I:%.*]] = insertelement <16 x i8> [[VECINIT13_I]], i8 [[TMP2]], i32 14 -// CHECK-NEXT: [[VECINIT15_I:%.*]] = insertelement <16 x i8> [[VECINIT14_I]], i8 [[TMP2]], i32 15 -// CHECK-NEXT: ret <16 x i8> [[VECINIT15_I]] +// CHECK-NEXT: [[VEXT_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT_I:%.*]] = shufflevector <16 x i8> [[VEXT_I]], <16 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VEXT1_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: 
[[VECINIT2_I:%.*]] = shufflevector <16 x i8> [[VECINIT_I]], <16 x i8> [[VEXT1_I]], <16 x i32> +// CHECK-NEXT: [[VEXT3_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT4_I:%.*]] = shufflevector <16 x i8> [[VECINIT2_I]], <16 x i8> [[VEXT3_I]], <16 x i32> +// CHECK-NEXT: [[VEXT5_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT6_I:%.*]] = shufflevector <16 x i8> [[VECINIT4_I]], <16 x i8> [[VEXT5_I]], <16 x i32> +// CHECK-NEXT: [[VEXT7_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT8_I:%.*]] = shufflevector <16 x i8> [[VECINIT6_I]], <16 x i8> [[VEXT7_I]], <16 x i32> +// CHECK-NEXT: [[VEXT9_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT10_I:%.*]] = shufflevector <16 x i8> [[VECINIT8_I]], <16 x i8> [[VEXT9_I]], <16 x i32> +// CHECK-NEXT: [[VEXT11_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT12_I:%.*]] = shufflevector <16 x i8> [[VECINIT10_I]], <16 x i8> [[VEXT11_I]], <16 x i32> +// CHECK-NEXT: [[VEXT13_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT14_I:%.*]] = shufflevector <16 x i8> [[VECINIT12_I]], <16 x i8> [[VEXT13_I]], <16 x i32> +// CHECK-NEXT: [[VEXT15_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT16_I:%.*]] = shufflevector <16 x i8> [[VECINIT14_I]], <16 x i8> [[VEXT15_I]], <16 x i32> +// CHECK-NEXT: [[VEXT17_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT18_I:%.*]] = shufflevector <16 x i8> [[VECINIT16_I]], <16 x i8> [[VEXT17_I]], <16 x i32> +// CHECK-NEXT: [[VEXT19_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT20_I:%.*]] = shufflevector <16 x i8> [[VECINIT18_I]], <16 x i8> [[VEXT19_I]], <16 x i32> +// CHECK-NEXT: [[VEXT21_I:%.*]] = shufflevector <1 x i8> 
[[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT22_I:%.*]] = shufflevector <16 x i8> [[VECINIT20_I]], <16 x i8> [[VEXT21_I]], <16 x i32> +// CHECK-NEXT: [[VEXT23_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT24_I:%.*]] = shufflevector <16 x i8> [[VECINIT22_I]], <16 x i8> [[VEXT23_I]], <16 x i32> +// CHECK-NEXT: [[VEXT25_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT26_I:%.*]] = shufflevector <16 x i8> [[VECINIT24_I]], <16 x i8> [[VEXT25_I]], <16 x i32> +// CHECK-NEXT: [[VEXT27_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT28_I:%.*]] = shufflevector <16 x i8> [[VECINIT26_I]], <16 x i8> [[VEXT27_I]], <16 x i32> +// CHECK-NEXT: [[VEXT29_I:%.*]] = shufflevector <1 x i8> [[A]], <1 x i8> poison, <16 x i32> +// CHECK-NEXT: [[VECINIT30_I:%.*]] = shufflevector <16 x i8> [[VECINIT28_I]], <16 x i8> [[VEXT29_I]], <16 x i32> +// CHECK-NEXT: ret <16 x i8> [[VECINIT30_I]] // mfloat8x16_t test_vmovq_n_mf8(mfloat8_t a) { return vmovq_n_mf8(a); @@ -585,7 +621,9 @@ mfloat8x16x2_t test_vuzpq_mf8(mfloat8x16_t a, mfloat8x16_t b) { // CHECK-SAME: <8 x i8> [[ARG_I8X8:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[ARG_I8X8]], i32 0 -// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[ARG_I8X8]], i8 [[VGET_LANE]], i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8 [[VGET_LANE]] to <1 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i8> [[TMP0]] to i8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[ARG_I8X8]], i8 [[TMP1]], i32 0 // CHECK-NEXT: ret void // void test_vcopy_lane_mf8(mfloat8x8_t arg_i8x8) { @@ -596,7 +634,9 @@ void test_vcopy_lane_mf8(mfloat8x8_t arg_i8x8) { // CHECK-SAME: <8 x i8> [[ARG_I8X8:%.*]], <16 x i8> [[ARG_I8X16:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <8 x i8> [[ARG_I8X8]], i32 0 
-// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[ARG_I8X16]], i8 [[VGET_LANE]], i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8 [[VGET_LANE]] to <1 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i8> [[TMP0]] to i8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[ARG_I8X16]], i8 [[TMP1]], i32 0 // CHECK-NEXT: ret void // void test_vcopyq_lane_mf8(mfloat8x8_t arg_i8x8, mfloat8x16_t arg_i8x16) { @@ -607,7 +647,9 @@ void test_vcopyq_lane_mf8(mfloat8x8_t arg_i8x8, mfloat8x16_t arg_i8x16) { // CHECK-SAME: <8 x i8> [[ARG_I8X8:%.*]], <16 x i8> [[ARG_I8X16:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[ARG_I8X16]], i32 0 -// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[ARG_I8X8]], i8 [[VGET_LANE]], i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8 [[VGET_LANE]] to <1 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i8> [[TMP0]] to i8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <8 x i8> [[ARG_I8X8]], i8 [[TMP1]], i32 0 // CHECK-NEXT: ret void // void test_vcopy_laneq_mf8(mfloat8x8_t arg_i8x8, mfloat8x16_t arg_i8x16) { @@ -618,7 +660,9 @@ void test_vcopy_laneq_mf8(mfloat8x8_t arg_i8x8, mfloat8x16_t arg_i8x16) { // CHECK-SAME: <16 x i8> [[ARG_I8X16:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] // CHECK-NEXT: [[VGET_LANE:%.*]] = extractelement <16 x i8> [[ARG_I8X16]], i32 0 -// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[ARG_I8X16]], i8 [[VGET_LANE]], i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8 [[VGET_LANE]] to <1 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <1 x i8> [[TMP0]] to i8 +// CHECK-NEXT: [[VSET_LANE:%.*]] = insertelement <16 x i8> [[ARG_I8X16]], i8 [[TMP1]], i32 0 // CHECK-NEXT: ret void // void test_vcopyq_laneq_mf8(mfloat8x16_t arg_i8x16) { diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c index 0b355db4b2073..2f3994df03784 100644 --- 
a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fdot.c @@ -49,8 +49,8 @@ svfloat32_t test_svdot_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, // CHECK-LABEL: define dso_local @test_svdot_n_f32_mf8( // CHECK-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fdot.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -59,8 +59,8 @@ svfloat32_t test_svdot_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t zm, // CHECK-CXX-LABEL: define dso_local @_Z20test_svdot_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m( // CHECK-CXX-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fdot.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -91,8 +91,8 @@ svfloat16_t test_svdot_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, // CHECK-LABEL: define dso_local @test_svdot_n_f16_mf8( // CHECK-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { 
// CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fdot.nxv8f16( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -101,8 +101,8 @@ svfloat16_t test_svdot_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm, // CHECK-CXX-LABEL: define dso_local @_Z20test_svdot_n_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tu6__mfp8m( // CHECK-CXX-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fdot.nxv8f16( [[ZDA]], [[ZN]], [[DOTSPLAT]]) diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c index 0daeeec9e7dd7..425e6a57ffe3c 100644 --- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_sve2_fp8_fmla.c @@ -49,8 +49,8 @@ svfloat16_t test_svmlalb_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm // CHECK-LABEL: define dso_local @test_svmlalb_n_f16_mf8( // CHECK-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = 
extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlalb.nxv8f16( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -59,8 +59,8 @@ svfloat16_t test_svmlalb_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm // CHECK-CXX-LABEL: define dso_local @_Z22test_svmlalb_n_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tu6__mfp8m( // CHECK-CXX-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlalb.nxv8f16( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -91,8 +91,8 @@ svfloat16_t test_svmlalt_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm // CHECK-LABEL: define dso_local @test_svmlalt_n_f16_mf8( // CHECK-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = 
tail call @llvm.aarch64.sve.fp8.fmlalt.nxv8f16( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -101,8 +101,8 @@ svfloat16_t test_svmlalt_f16_mf8(svfloat16_t zda, svmfloat8_t zn, svmfloat8_t zm // CHECK-CXX-LABEL: define dso_local @_Z22test_svmlalt_n_f16_mf8u13__SVFloat16_tu13__SVMfloat8_tu6__mfp8m( // CHECK-CXX-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlalt.nxv8f16( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -169,8 +169,8 @@ svfloat32_t test_svmlallbb_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t // CHECK-LABEL: define dso_local @test_svmlallbb_n_f32_mf8( // CHECK-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlallbb.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -179,8 +179,8 @@ svfloat32_t test_svmlallbb_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t // CHECK-CXX-LABEL: define dso_local @_Z24test_svmlallbb_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m( // CHECK-CXX-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef 
[[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlallbb.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -211,8 +211,8 @@ svfloat32_t test_svmlallbt_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t // CHECK-LABEL: define dso_local @test_svmlallbt_n_f32_mf8( // CHECK-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlallbt.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -221,8 +221,8 @@ svfloat32_t test_svmlallbt_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t // CHECK-CXX-LABEL: define dso_local @_Z24test_svmlallbt_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m( // CHECK-CXX-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // 
CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlallbt.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -253,8 +253,8 @@ svfloat32_t test_svmlalltb_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t // CHECK-LABEL: define dso_local @test_svmlalltb_n_f32_mf8( // CHECK-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlalltb.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -263,8 +263,8 @@ svfloat32_t test_svmlalltb_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t // CHECK-CXX-LABEL: define dso_local @_Z24test_svmlalltb_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m( // CHECK-CXX-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlalltb.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -295,8 +295,8 @@ svfloat32_t test_svmlalltt_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t // CHECK-LABEL: define dso_local 
@test_svmlalltt_n_f32_mf8( // CHECK-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlalltt.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) @@ -305,8 +305,8 @@ svfloat32_t test_svmlalltt_f32_mf8(svfloat32_t zda, svmfloat8_t zn, svmfloat8_t // CHECK-CXX-LABEL: define dso_local @_Z24test_svmlalltt_n_f32_mf8u13__SVFloat32_tu13__SVMfloat8_tu6__mfp8m( // CHECK-CXX-SAME: [[ZDA:%.*]], [[ZN:%.*]], <1 x i8> [[ZM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = extractelement <1 x i8> [[ZM]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i8 [[TMP0]], i64 0 // CHECK-CXX-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer // CHECK-CXX-NEXT: [[TMP1:%.*]] = tail call @llvm.aarch64.sve.fp8.fmlalltt.nxv4f32( [[ZDA]], [[ZN]], [[DOTSPLAT]]) diff --git a/clang/test/CodeGen/arm-mfp8.c b/clang/test/CodeGen/arm-mfp8.c index d9e7b5d4707d8..9385b537f18b3 100644 --- a/clang/test/CodeGen/arm-mfp8.c +++ b/clang/test/CodeGen/arm-mfp8.c @@ -38,34 +38,22 @@ mfloat8x8_t test_ret_mfloat8x8_t(mfloat8x8_t v) { // CHECK-C-LABEL: define dso_local <1 x i8> @func1n( // CHECK-C-SAME: <1 x i8> [[MFP8:%.*]]) #[[ATTR0]] { // CHECK-C-NEXT: [[ENTRY:.*:]] -// CHECK-C-NEXT: [[RETVAL:%.*]] = alloca <1 x i8>, align 1 -// CHECK-C-NEXT: [[MFP8_ADDR:%.*]] = 
alloca i8, align 1 -// CHECK-C-NEXT: [[F1N:%.*]] = alloca [10 x i8], align 1 -// CHECK-C-NEXT: store <1 x i8> [[MFP8]], ptr [[MFP8_ADDR]], align 1 -// CHECK-C-NEXT: [[TMP0:%.*]] = load i8, ptr [[MFP8_ADDR]], align 1 -// CHECK-C-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i8], ptr [[F1N]], i64 0, i64 2 -// CHECK-C-NEXT: store i8 [[TMP0]], ptr [[ARRAYIDX]], align 1 -// CHECK-C-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x i8], ptr [[F1N]], i64 0, i64 2 -// CHECK-C-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -// CHECK-C-NEXT: store i8 [[TMP1]], ptr [[RETVAL]], align 1 -// CHECK-C-NEXT: [[TMP2:%.*]] = load <1 x i8>, ptr [[RETVAL]], align 1 -// CHECK-C-NEXT: ret <1 x i8> [[TMP2]] +// CHECK-C-NEXT: [[F1N:%.*]] = alloca [10 x <1 x i8>], align 1 +// CHECK-C-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x <1 x i8>], ptr [[F1N]], i64 0, i64 2 +// CHECK-C-NEXT: store <1 x i8> [[MFP8]], ptr [[ARRAYIDX]], align 1 +// CHECK-C-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x <1 x i8>], ptr [[F1N]], i64 0, i64 2 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[ARRAYIDX1]], align 1 +// CHECK-C-NEXT: ret <1 x i8> [[TMP0]] // // CHECK-CXX-LABEL: define dso_local <1 x i8> @_Z6func1nu6__mfp8( // CHECK-CXX-SAME: <1 x i8> [[MFP8:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[RETVAL:%.*]] = alloca <1 x i8>, align 1 -// CHECK-CXX-NEXT: [[MFP8_ADDR:%.*]] = alloca i8, align 1 -// CHECK-CXX-NEXT: [[F1N:%.*]] = alloca [10 x i8], align 1 -// CHECK-CXX-NEXT: store <1 x i8> [[MFP8]], ptr [[MFP8_ADDR]], align 1 -// CHECK-CXX-NEXT: [[TMP0:%.*]] = load i8, ptr [[MFP8_ADDR]], align 1 -// CHECK-CXX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i8], ptr [[F1N]], i64 0, i64 2 -// CHECK-CXX-NEXT: store i8 [[TMP0]], ptr [[ARRAYIDX]], align 1 -// CHECK-CXX-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x i8], ptr [[F1N]], i64 0, i64 2 -// CHECK-CXX-NEXT: [[TMP1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -// 
CHECK-CXX-NEXT: store i8 [[TMP1]], ptr [[RETVAL]], align 1 -// CHECK-CXX-NEXT: [[TMP2:%.*]] = load <1 x i8>, ptr [[RETVAL]], align 1 -// CHECK-CXX-NEXT: ret <1 x i8> [[TMP2]] +// CHECK-CXX-NEXT: [[F1N:%.*]] = alloca [10 x <1 x i8>], align 1 +// CHECK-CXX-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x <1 x i8>], ptr [[F1N]], i64 0, i64 2 +// CHECK-CXX-NEXT: store <1 x i8> [[MFP8]], ptr [[ARRAYIDX]], align 1 +// CHECK-CXX-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x <1 x i8>], ptr [[F1N]], i64 0, i64 2 +// CHECK-CXX-NEXT: [[TMP0:%.*]] = load <1 x i8>, ptr [[ARRAYIDX1]], align 1 +// CHECK-CXX-NEXT: ret <1 x i8> [[TMP0]] // __mfp8 func1n(__mfp8 mfp8) { __mfp8 f1n[10]; @@ -98,18 +86,14 @@ mfloat8_t test_extract_element(mfloat8x16_t x, int i) { // CHECK-C-LABEL: define dso_local <16 x i8> @test_insert_element( // CHECK-C-SAME: <16 x i8> [[X:%.*]], i32 noundef [[I:%.*]], <1 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-C-NEXT: [[ENTRY:.*:]] -// CHECK-C-NEXT: [[V_ADDR:%.*]] = alloca i8, align 1 -// CHECK-C-NEXT: store <1 x i8> [[V]], ptr [[V_ADDR]], align 1 -// CHECK-C-NEXT: [[TMP0:%.*]] = load i8, ptr [[V_ADDR]], align 1 +// CHECK-C-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[V]] to i8 // CHECK-C-NEXT: [[VECINS:%.*]] = insertelement <16 x i8> [[X]], i8 [[TMP0]], i32 [[I]] // CHECK-C-NEXT: ret <16 x i8> [[VECINS]] // // CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z19test_insert_element14__Mfloat8x16_tiu6__mfp8( // CHECK-CXX-SAME: <16 x i8> [[X:%.*]], i32 noundef [[I:%.*]], <1 x i8> [[V:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[V_ADDR:%.*]] = alloca i8, align 1 -// CHECK-CXX-NEXT: store <1 x i8> [[V]], ptr [[V_ADDR]], align 1 -// CHECK-CXX-NEXT: [[TMP0:%.*]] = load i8, ptr [[V_ADDR]], align 1 +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <1 x i8> [[V]] to i8 // CHECK-CXX-NEXT: [[VECINS:%.*]] = insertelement <16 x i8> [[X]], i8 [[TMP0]], i32 [[I]] // CHECK-CXX-NEXT: ret <16 x i8> [[VECINS]] // diff --git 
a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index c3dbef385e778..53bb1c635f01f 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -2059,18 +2059,18 @@ void NeonEmitter::createIntrinsic(const Record *R, // MFloat8 type is only available on AArch64. If encountered set ArchGuard // correctly. - std::string savedArchGuard = ArchGuard; + std::string NewArchGuard = ArchGuard; if (Type(I.first, ".").isMFloat8()) { - if (ArchGuard.empty()) { - ArchGuard = "defined(__aarch64__)"; - } else if (ArchGuard.find("defined(__aarch64__)") == std::string::npos) { - ArchGuard = "defined(__aarch64__) && (" + savedArchGuard + ")"; + if (NewArchGuard.empty()) { + NewArchGuard = "defined(__aarch64__)"; + } else if (NewArchGuard.find("defined(__aarch64__)") == + std::string::npos) { + NewArchGuard = "defined(__aarch64__) && (" + NewArchGuard + ")"; } } Entry.emplace_back(R, Name, Proto, I.first, I.second, CK, Body, *this, - ArchGuard, TargetGuard, IsUnavailable, BigEndianSafe); + NewArchGuard, TargetGuard, IsUnavailable, BigEndianSafe); Out.push_back(&Entry.back()); - ArchGuard = savedArchGuard; } CurrentRecord = nullptr; From 53202b7d14237ac75b831ab5cbb40de8c18638d3 Mon Sep 17 00:00:00 2001 From: Marian Lukac Date: Fri, 9 May 2025 10:07:47 +0000 Subject: [PATCH 4/5] Simplify assertion Change-Id: I9ee2f41ec8879bd631c6ef64e9dc721ef22cf2a1 --- clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index a38436611a1ed..db22da72ea8a6 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -4190,11 +4190,9 @@ Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E, // to extract sclar element type is necessary. 
if (MemEltTy->isVectorTy()) { #ifndef NDEBUG - auto *VecTy = cast(MemEltTy); - ElementCount EC = VecTy->getElementCount(); - assert(EC.isScalar() && VecTy->getElementType() == Int8Ty && - "Only <1 x i8> expected"); - #endif + assert(MemEltTy == FixedVectorType::get(Int8Ty, 1) && + "Only <1 x i8> expected"); +#endif MemEltTy = cast(MemEltTy)->getElementType(); } @@ -4249,11 +4247,9 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E, // to extract sclar element type is necessary. if (MemEltTy->isVectorTy()) { #ifndef NDEBUG - auto *VecTy = cast(MemEltTy); - ElementCount EC = VecTy->getElementCount(); - assert(EC.isScalar() && VecTy->getElementType() == Int8Ty && - "Only <1 x i8> expected"); - #endif + assert(MemEltTy == FixedVectorType::get(Int8Ty, 1) && + "Only <1 x i8> expected"); +#endif MemEltTy = cast(MemEltTy)->getElementType(); } From cd8af9dc7cc745f47774c5061aa9d6ca33df7f0e Mon Sep 17 00:00:00 2001 From: Marian Lukac Date: Fri, 9 May 2025 10:14:47 +0000 Subject: [PATCH 5/5] remove debug guards Change-Id: Ic460c0e6afdccdc37ef31f78cde9933cdcb3c544 --- clang/lib/CodeGen/TargetBuiltins/ARM.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp index db22da72ea8a6..b95d8cb50374b 100644 --- a/clang/lib/CodeGen/TargetBuiltins/ARM.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/ARM.cpp @@ -4189,11 +4189,9 @@ Value *CodeGenFunction::EmitSVEMaskedLoad(const CallExpr *E, // Mfloat8 types is stored as a vector, so extra work // to extract sclar element type is necessary. 
if (MemEltTy->isVectorTy()) { - #ifndef NDEBUG assert(MemEltTy == FixedVectorType::get(Int8Ty, 1) && "Only <1 x i8> expected"); -#endif - MemEltTy = cast(MemEltTy)->getElementType(); + MemEltTy = cast(MemEltTy)->getElementType(); } // The vector type that is returned may be different from the @@ -4246,11 +4244,9 @@ Value *CodeGenFunction::EmitSVEMaskedStore(const CallExpr *E, // Mfloat8 types is stored as a vector, so extra work // to extract sclar element type is necessary. if (MemEltTy->isVectorTy()) { - #ifndef NDEBUG assert(MemEltTy == FixedVectorType::get(Int8Ty, 1) && "Only <1 x i8> expected"); -#endif - MemEltTy = cast(MemEltTy)->getElementType(); + MemEltTy = cast(MemEltTy)->getElementType(); } // The vector type that is stored may be different from the