From e8a216c5effbf426ada5b9deb89fc2b5d9405f7c Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Fri, 6 Dec 2024 13:09:23 +0000 Subject: [PATCH 01/16] [AArch64] Refactor implementation of FP8 types (NFC) * The FP8 scalar type (`__mfp8`) was described as a vector type * The FP8 vector types were described/assumed to have integer element type (the element type ought to be `__mfp8`), * Add support for `m` type specifier (denoting `__mfp8`) in `DecodeTypeFromStr` and create SVE builtin prototypes using the specifier, instead of `int8_t`. --- clang/include/clang/AST/Type.h | 5 +++ .../clang/Basic/AArch64SVEACLETypes.def | 24 +++++++++--- clang/lib/AST/ASTContext.cpp | 37 +++++++++++++++---- clang/lib/AST/ItaniumMangle.cpp | 5 +++ clang/lib/AST/Type.cpp | 4 +- clang/lib/CodeGen/CodeGenTypes.cpp | 13 +++++-- clang/lib/CodeGen/Targets/AArch64.cpp | 7 +++- clang/utils/TableGen/SveEmitter.cpp | 4 +- 8 files changed, 76 insertions(+), 23 deletions(-) diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 09c98f642852f..aa313719a6575 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2518,6 +2518,7 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { bool isFloat32Type() const; bool isDoubleType() const; bool isBFloat16Type() const; + bool isMFloat8Type() const; bool isFloat128Type() const; bool isIbm128Type() const; bool isRealType() const; // C99 6.2.5p17 (real floating + integer) @@ -8532,6 +8533,10 @@ inline bool Type::isBFloat16Type() const { return isSpecificBuiltinType(BuiltinType::BFloat16); } +inline bool Type::isMFloat8Type() const { + return isSpecificBuiltinType(BuiltinType::MFloat8); +} + inline bool Type::isFloat128Type() const { return isSpecificBuiltinType(BuiltinType::Float128); } diff --git a/clang/include/clang/Basic/AArch64SVEACLETypes.def b/clang/include/clang/Basic/AArch64SVEACLETypes.def index 063cac1f4a58e..6b704b386536c 100644 --- a/clang/include/clang/Basic/AArch64SVEACLETypes.def +++ b/clang/include/clang/Basic/AArch64SVEACLETypes.def @@ -57,6 +57,11 @@ // - IsBF true for vector of brain float elements. //===----------------------------------------------------------------------===// +#ifndef SVE_SCALAR_TYPE +#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits) \ + SVE_TYPE(Name, Id, SingletonId) +#endif + #ifndef SVE_VECTOR_TYPE #define SVE_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ SVE_TYPE(Name, Id, SingletonId) @@ -72,6 +77,11 @@ SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, false, true) #endif +#ifndef SVE_VECTOR_TYPE_MFLOAT +#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \ + SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, false, false) +#endif + #ifndef SVE_VECTOR_TYPE_FLOAT #define SVE_VECTOR_TYPE_FLOAT(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF) \ SVE_VECTOR_TYPE_DETAILS(Name, MangledName, Id, SingletonId, NumEls, ElBits, NF, false, true, false) @@ -125,8 +135,7 @@ SVE_VECTOR_TYPE_FLOAT("__SVFloat64_t", "__SVFloat64_t", SveFloat64, SveFloat64Ty SVE_VECTOR_TYPE_BFLOAT("__SVBfloat16_t", "__SVBfloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, 1) -// This is a 8 bits opaque type. 
-SVE_VECTOR_TYPE_INT("__SVMfloat8_t", "__SVMfloat8_t", SveMFloat8, SveMFloat8Ty, 16, 8, 1, false) +SVE_VECTOR_TYPE_MFLOAT("__SVMfloat8_t", "__SVMfloat8_t", SveMFloat8, SveMFloat8Ty, 16, 8, 1) // // x2 @@ -148,7 +157,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x2_t", "svfloat64x2_t", SveFloat64x2, Sv SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x2_t", "svbfloat16x2_t", SveBFloat16x2, SveBFloat16x2Ty, 8, 16, 2) -SVE_VECTOR_TYPE_INT("__clang_svmfloat8x2_t", "svmfloat8x2_t", SveMFloat8x2, SveMFloat8x2Ty, 16, 8, 2, false) +SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x2_t", "svmfloat8x2_t", SveMFloat8x2, SveMFloat8x2Ty, 16, 8, 2) // // x3 @@ -170,7 +179,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x3_t", "svfloat64x3_t", SveFloat64x3, Sv SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x3_t", "svbfloat16x3_t", SveBFloat16x3, SveBFloat16x3Ty, 8, 16, 3) -SVE_VECTOR_TYPE_INT("__clang_svmfloat8x3_t", "svmfloat8x3_t", SveMFloat8x3, SveMFloat8x3Ty, 16, 8, 3, false) +SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x3_t", "svmfloat8x3_t", SveMFloat8x3, SveMFloat8x3Ty, 16, 8, 3) // // x4 @@ -192,7 +201,7 @@ SVE_VECTOR_TYPE_FLOAT("__clang_svfloat64x4_t", "svfloat64x4_t", SveFloat64x4, Sv SVE_VECTOR_TYPE_BFLOAT("__clang_svbfloat16x4_t", "svbfloat16x4_t", SveBFloat16x4, SveBFloat16x4Ty, 8, 16, 4) -SVE_VECTOR_TYPE_INT("__clang_svmfloat8x4_t", "svmfloat8x4_t", SveMFloat8x4, SveMFloat8x4Ty, 16, 8, 4, false) +SVE_VECTOR_TYPE_MFLOAT("__clang_svmfloat8x4_t", "svmfloat8x4_t", SveMFloat8x4, SveMFloat8x4Ty, 16, 8, 4) SVE_PREDICATE_TYPE_ALL("__SVBool_t", "__SVBool_t", SveBool, SveBoolTy, 16, 1) SVE_PREDICATE_TYPE_ALL("__clang_svboolx2_t", "svboolx2_t", SveBoolx2, SveBoolx2Ty, 16, 2) @@ -200,11 +209,13 @@ SVE_PREDICATE_TYPE_ALL("__clang_svboolx4_t", "svboolx4_t", SveBoolx4, SveBoolx4T SVE_OPAQUE_TYPE("__SVCount_t", "__SVCount_t", SveCount, SveCountTy) -AARCH64_VECTOR_TYPE_MFLOAT("__mfp8", "__mfp8", MFloat8, MFloat8Ty, 1, 8, 1) +SVE_SCALAR_TYPE("__mfp8", "__mfp8", MFloat8, MFloat8Ty, 8) + AARCH64_VECTOR_TYPE_MFLOAT("__MFloat8x8_t", "__MFloat8x8_t", MFloat8x8, MFloat8x8Ty, 8, 8, 1) AARCH64_VECTOR_TYPE_MFLOAT("__MFloat8x16_t", "__MFloat8x16_t", MFloat8x16, MFloat8x16Ty, 16, 8, 1) #undef SVE_VECTOR_TYPE +#undef SVE_VECTOR_TYPE_MFLOAT #undef SVE_VECTOR_TYPE_BFLOAT #undef SVE_VECTOR_TYPE_FLOAT #undef SVE_VECTOR_TYPE_INT @@ -213,4 +224,5 @@ AARCH64_VECTOR_TYPE_MFLOAT("__MFloat8x16_t", "__MFloat8x16_t", MFloat8x16, MFloa #undef SVE_OPAQUE_TYPE #undef AARCH64_VECTOR_TYPE_MFLOAT #undef AARCH64_VECTOR_TYPE +#undef SVE_SCALAR_TYPE #undef SVE_TYPE diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 8f04b58841964..df12c110bf357 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -2275,6 +2275,11 @@ TypeInfo ASTContext::getTypeInfoImpl(const Type *T) const { Width = NumEls * ElBits * NF; \ Align = NumEls * ElBits; \ break; +#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits) \ + case BuiltinType::Id: \ + Width = Bits; \ + Align = Bits; \ + break; #include "clang/Basic/AArch64SVEACLETypes.def" #define PPC_VECTOR_TYPE(Name, Id, Size) \ case BuiltinType::Id: \ @@ -4395,15 +4400,18 @@ ASTContext::getBuiltinVectorTypeInfo(const BuiltinType *Ty) const { ElBits, NF) \ case BuiltinType::Id: \ return {BFloat16Ty, llvm::ElementCount::getScalable(NumEls), NF}; +#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF) \ + case BuiltinType::Id: \ + return {MFloat8Ty, llvm::ElementCount::getScalable(NumEls), NF}; #define SVE_PREDICATE_TYPE_ALL(Name, MangledName, 
Id, SingletonId, NumEls, NF) \ case BuiltinType::Id: \ return {BoolTy, llvm::ElementCount::getScalable(NumEls), NF}; #define AARCH64_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ ElBits, NF) \ case BuiltinType::Id: \ - return {getIntTypeForBitwidth(ElBits, false), \ - llvm::ElementCount::getFixed(NumEls), NF}; -#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) + return {MFloat8Ty, llvm::ElementCount::getFixed(NumEls), NF}; +#define SVE_TYPE(Name, Id, SingletonId) #include "clang/Basic/AArch64SVEACLETypes.def" #define RVV_VECTOR_TYPE_INT(Name, Id, SingletonId, NumEls, ElBits, NF, \ @@ -4465,11 +4473,16 @@ QualType ASTContext::getScalableVectorType(QualType EltTy, unsigned NumElts, EltTySize == ElBits && NumElts == (NumEls * NF) && NumFields == 1) { \ return SingletonId; \ } +#define SVE_VECTOR_TYPE_MFLOAT(Name, MangledName, Id, SingletonId, NumEls, \ + ElBits, NF) \ + if (EltTy->isMFloat8Type() && EltTySize == ElBits && \ + NumElts == (NumEls * NF) && NumFields == 1) { \ + return SingletonId; \ + } #define SVE_PREDICATE_TYPE_ALL(Name, MangledName, Id, SingletonId, NumEls, NF) \ if (EltTy->isBooleanType() && NumElts == (NumEls * NF) && NumFields == 1) \ return SingletonId; -#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) -#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId) +#define SVE_TYPE(Name, Id, SingletonId) #include "clang/Basic/AArch64SVEACLETypes.def" } else if (Target->hasRISCVVTypes()) { uint64_t EltTySize = getTypeSize(EltTy); @@ -12216,8 +12229,15 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context, RequiresICE, false); assert(!RequiresICE && "Can't require vector ICE"); - // TODO: No way to make AltiVec vectors in builtins yet. - Type = Context.getVectorType(ElementType, NumElements, VectorKind::Generic); + if (ElementType == Context.MFloat8Ty) { + assert((NumElements == 8 || NumElements == 16) && + "Invalid number of elements"); + Type = NumElements == 8 ? Context.MFloat8x8Ty : Context.MFloat8x16Ty; + } else { + // TODO: No way to make AltiVec vectors in builtins yet. + Type = + Context.getVectorType(ElementType, NumElements, VectorKind::Generic); + } break; } case 'E': { @@ -12273,6 +12293,9 @@ static QualType DecodeTypeFromStr(const char *&Str, const ASTContext &Context, case 'p': Type = Context.getProcessIDType(); break; + case 'm': + Type = Context.MFloat8Ty; + break; } // If there are modifiers and if we're allowed to parse them, go for it. diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 47aa9b40dab84..9404f9fd9b151 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -3438,6 +3438,11 @@ void CXXNameMangler::mangleType(const BuiltinType *T) { type_name = MangledName; \ Out << (type_name == Name ? "u" : "") << type_name.size() << type_name; \ break; +#define SVE_SCALAR_TYPE(Name, MangledName, Id, SingletonId, Bits) \ + case BuiltinType::Id: \ + type_name = MangledName; \ + Out << (type_name == Name ? 
"u" : "") << type_name.size() << type_name; \ + break; #include "clang/Basic/AArch64SVEACLETypes.def" #define PPC_VECTOR_TYPE(Name, Id, Size) \ case BuiltinType::Id: \ diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index caa0ac858a1be..fde0746a17570 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -2527,9 +2527,7 @@ bool Type::isSVESizelessBuiltinType() const { #define SVE_PREDICATE_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: \ return true; -#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ - case BuiltinType::Id: \ - return false; +#define SVE_TYPE(Name, Id, SingletonId) #include "clang/Basic/AArch64SVEACLETypes.def" default: return false; diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 09191a4901f49..fd3327cf9acd8 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -507,13 +507,15 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { case BuiltinType::Id: #define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId) \ case BuiltinType::Id: -#define SVE_OPAQUE_TYPE(Name, MangledName, Id, SingletonId) +#define SVE_TYPE(Name, Id, SingletonId) #include "clang/Basic/AArch64SVEACLETypes.def" { ASTContext::BuiltinVectorTypeInfo Info = Context.getBuiltinVectorTypeInfo(cast(Ty)); - auto VTy = - llvm::VectorType::get(ConvertType(Info.ElementType), Info.EC); + auto *EltTy = Info.ElementType->isMFloat8Type() + ? llvm::Type::getInt8Ty(getLLVMContext()) + : ConvertType(Info.ElementType); + auto *VTy = llvm::VectorType::get(EltTy, Info.EC); switch (Info.NumVectors) { default: llvm_unreachable("Expected 1, 2, 3 or 4 vectors!"); @@ -529,6 +531,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { } case BuiltinType::SveCount: return llvm::TargetExtType::get(getLLVMContext(), "aarch64.svcount"); + case BuiltinType::MFloat8: + return llvm::VectorType::get(llvm::Type::getInt8Ty(getLLVMContext()), 1, + false); #define PPC_VECTOR_TYPE(Name, Id, Size) \ case BuiltinType::Id: \ ResultType = \ @@ -650,6 +655,8 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { // An ext_vector_type of Bool is really a vector of bits. llvm::Type *IRElemTy = VT->isExtVectorBoolType() ? llvm::Type::getInt1Ty(getLLVMContext()) + : VT->getElementType()->isMFloat8Type() + ? llvm::Type::getInt8Ty(getLLVMContext()) : ConvertType(VT->getElementType()); ResultType = llvm::FixedVectorType::get(IRElemTy, VT->getNumElements()); break; diff --git a/clang/lib/CodeGen/Targets/AArch64.cpp b/clang/lib/CodeGen/Targets/AArch64.cpp index 7db67ecba07c8..7158b94a2b3d6 100644 --- a/clang/lib/CodeGen/Targets/AArch64.cpp +++ b/clang/lib/CodeGen/Targets/AArch64.cpp @@ -244,6 +244,7 @@ AArch64ABIInfo::convertFixedToScalableVectorType(const VectorType *VT) const { case BuiltinType::SChar: case BuiltinType::UChar: + case BuiltinType::MFloat8: return llvm::ScalableVectorType::get( llvm::Type::getInt8Ty(getVMContext()), 16); @@ -781,8 +782,10 @@ bool AArch64ABIInfo::passAsPureScalableType( NPred += Info.NumVectors; else NVec += Info.NumVectors; - auto VTy = llvm::ScalableVectorType::get(CGT.ConvertType(Info.ElementType), - Info.EC.getKnownMinValue()); + llvm::Type *EltTy = Info.ElementType->isMFloat8Type() + ? 
llvm::Type::getInt8Ty(getVMContext()) + : CGT.ConvertType(Info.ElementType); + auto *VTy = llvm::ScalableVectorType::get(EltTy, Info.EC.getKnownMinValue()); if (CoerceToSeq.size() + Info.NumVectors > 12) return false; diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index 97b768db3a313..3cd7aed9469ab 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -448,7 +448,7 @@ std::string SVEType::builtinBaseType() const { case TypeKind::PredicatePattern: return "i"; case TypeKind::Fpm: - return "Wi"; + return "UWi"; case TypeKind::Predicate: return "b"; case TypeKind::BFloat16: @@ -456,7 +456,7 @@ std::string SVEType::builtinBaseType() const { return "y"; case TypeKind::MFloat8: assert(ElementBitwidth == 8 && "Invalid MFloat8!"); - return "c"; + return "m"; case TypeKind::Float: switch (ElementBitwidth) { case 16: From 01ee70b38a31e3c9e19fdc1fd70d68ee1fe4b83f Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 13 Jan 2025 16:28:20 +0000 Subject: [PATCH 02/16] [fixup] Add a comment about special case of mapping FP8 vectors to LLVM vector types --- clang/lib/CodeGen/CodeGenTypes.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index fd3327cf9acd8..5b3048ae2f21c 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -512,6 +512,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { { ASTContext::BuiltinVectorTypeInfo Info = Context.getBuiltinVectorTypeInfo(cast(Ty)); + // The `__mfp8` type maps to `<1 x i8>` which can't be used to build + // a vector type, hence bypass the call to `ConvertType` for + // the element type and create the vector type directly. auto *EltTy = Info.ElementType->isMFloat8Type() ? llvm::Type::getInt8Ty(getLLVMContext()) : ConvertType(Info.ElementType); From a4d998ffa444f7ba539a6375f6043ac80cdef05c Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Fri, 6 Dec 2024 15:44:58 +0000 Subject: [PATCH 03/16] [Clang][AArch64] Allow FP8 Neon vector types to be used by __builtin_shufflevector The Neon vector types for FP8 (`__MFloat8x8_t` and `__MFloat8x16_t`) are implemented as builtin types and need a special case in `__builtin_shufflevector`. --- clang/include/clang/AST/Type.h | 4 + .../clang/Basic/DiagnosticSemaKinds.td | 5 + clang/lib/AST/Type.cpp | 13 ++ clang/lib/Sema/SemaChecking.cpp | 39 +++++- .../AArch64/builtin-shufflevector-fp8.c | 123 ++++++++++++++++++ clang/test/Sema/builtin-shufflevector.c | 30 +++++ 6 files changed, 208 insertions(+), 6 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c create mode 100644 clang/test/Sema/builtin-shufflevector.c diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index aa313719a6575..fbc62f61ad5a5 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2404,6 +2404,10 @@ class alignas(TypeAlignment) Type : public ExtQualsTypeCommonBase { /// SVE vector or predicate, excluding tuple types such as svint32x4_t. bool isSveVLSBuiltinType() const; + /// Determines if this is a *builtin* NEON vector type, a type not built with + /// `neon_vector_type` + bool isNeonVectorBuiltinType() const; + /// Returns the representative type for the element of an SVE builtin type. /// This is used to represent fixed-length SVE vectors created with the /// 'arm_sve_vector_bits' type attribute as VectorType. 
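
(For orientation, a minimal user-level sketch of what this patch enables. It mirrors the builtin-shufflevector-fp8.c test added below; the wrapper name is illustrative, and only the `__MFloat8x8_t` builtin type is assumed.)

    typedef __MFloat8x8_t mfloat8x8_t;

    /* Reverse the lanes of a builtin Neon FP8 vector with a constant mask;
       after this patch __builtin_shufflevector accepts the builtin vector
       type directly. */
    mfloat8x8_t reverse_fp8(mfloat8x8_t v) {
      return __builtin_shufflevector(v, v, 7, 6, 5, 4, 3, 2, 1, 0);
    }
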
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index d4e897868f1a9..b43f2d9143117 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10564,6 +10564,9 @@ def err_vec_builtin_incompatible_vector : Error<
 def err_vsx_builtin_nonconstant_argument : Error<
   "argument %0 to %1 must be a 2-bit unsigned literal (i.e. 0, 1, 2 or 3)">;
 
+def err_shufflevector_incompatible_index_vector : Error<
+  "second argument for __builtin_shufflevector must be integer vector "
+  "with length equal to the length of the first argument">;
 def err_shufflevector_nonconstant_argument : Error<
   "index for __builtin_shufflevector must be a constant integer">;
 def err_shufflevector_argument_too_large : Error<
@@ -10571,6 +10574,8 @@ def err_shufflevector_argument_too_large : Error<
   "of vector elements">;
 def err_shufflevector_minus_one_is_undefined_behavior_constexpr : Error<
   "index for __builtin_shufflevector not within the bounds of the input vectors; index of -1 found at position %0 is not permitted in a constexpr context">;
+def err_shufflevector_unsupported_result_vector_type : Error<
+  "unsupported vector type for the result">;
 
 def err_convertvector_non_vector : Error<
   "first argument to __builtin_convertvector must be a vector">;
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index fde0746a17570..fb55bab0a67de 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2576,6 +2576,19 @@ bool Type::isSveVLSBuiltinType() const {
   return false;
 }
 
+bool Type::isNeonVectorBuiltinType() const {
+  if (const BuiltinType *BT = getAs<BuiltinType>()) {
+    switch (BT->getKind()) {
+    case BuiltinType::MFloat8x8:
+    case BuiltinType::MFloat8x16:
+      return true;
+    default:
+      return false;
+    }
+  }
+  return false;
+}
+
 QualType Type::getSizelessVectorEltType(const ASTContext &Ctx) const {
   assert(isSizelessVectorType() && "Must be sizeless vector type");
   // Currently supports SVE and RVV
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 28dcfaac2e84f..1a407c7a6cece 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5130,24 +5130,32 @@ ExprResult Sema::BuiltinShuffleVector(CallExpr *TheCall) {
   QualType LHSType = TheCall->getArg(0)->getType();
   QualType RHSType = TheCall->getArg(1)->getType();
 
-  if (!LHSType->isVectorType() || !RHSType->isVectorType())
+  if (!LHSType->isVectorType() && !LHSType->isNeonVectorBuiltinType())
     return ExprError(
-        Diag(TheCall->getBeginLoc(), diag::err_vec_builtin_non_vector)
-        << TheCall->getDirectCallee() << /*isMorethantwoArgs*/ false
+        Diag(TheCall->getBeginLoc(), diag::err_builtin_non_vector_type)
+        << "first" << TheCall->getDirectCallee()
+        << /*isMorethantwoArgs*/ false
         << SourceRange(TheCall->getArg(0)->getBeginLoc(),
                        TheCall->getArg(1)->getEndLoc()));
 
-  numElements = LHSType->castAs<VectorType>()->getNumElements();
+  if (auto *Ty = LHSType->getAs<BuiltinType>()) {
+    assert(Ty->getKind() == BuiltinType::MFloat8x8 ||
+           Ty->getKind() == BuiltinType::MFloat8x16);
+    numElements = Ty->getKind() == BuiltinType::MFloat8x8 ? 8 : 16;
+  } else {
+    numElements = LHSType->castAs<VectorType>()->getNumElements();
+  }
+
   unsigned numResElements = TheCall->getNumArgs() - 2;
 
   // Check to see if we have a call with 2 vector arguments, the unary shuffle
   // with mask. If so, verify that RHS is an integer vector type with the
   // same number of elts as lhs.
  if (TheCall->getNumArgs() == 2) {
-    if (!RHSType->hasIntegerRepresentation() ||
+    if (!RHSType->isVectorType() || !RHSType->hasIntegerRepresentation() ||
         RHSType->castAs<VectorType>()->getNumElements() != numElements)
       return ExprError(Diag(TheCall->getBeginLoc(),
-                            diag::err_vec_builtin_incompatible_vector)
+                            diag::err_shufflevector_incompatible_index_vector)
                        << TheCall->getDirectCallee()
                        << /*isMorethantwoArgs*/ false
                        << SourceRange(TheCall->getArg(1)->getBeginLoc(),
@@ -5160,6 +5168,25 @@ ExprResult Sema::BuiltinShuffleVector(CallExpr *TheCall) {
                      << SourceRange(TheCall->getArg(0)->getBeginLoc(),
                                     TheCall->getArg(1)->getEndLoc()));
   } else if (numElements != numResElements) {
+    if (auto *Ty = LHSType->getAs<BuiltinType>()) {
+      assert(Ty->getKind() == BuiltinType::MFloat8x8 ||
+             Ty->getKind() == BuiltinType::MFloat8x16);
+      switch (numResElements) {
+      case 8:
+        resType = Context.MFloat8x8Ty;
+        break;
+      case 16:
+        resType = Context.MFloat8x16Ty;
+        break;
+      default:
+        return ExprError(Diag(TheCall->getBeginLoc(),
+                              diag::err_shufflevector_unsupported_result_vector_type)
+                         << TheCall->getDirectCallee()
+                         << /*isMorethantwoArgs*/ false
+                         << SourceRange(TheCall->getArg(0)->getBeginLoc(),
+                                        TheCall->getArg(1)->getEndLoc()));
+      }
+    }
     QualType eltType = LHSType->castAs<VectorType>()->getElementType();
     resType =
         Context.getVectorType(eltType, numResElements, VectorKind::Generic);
diff --git a/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c b/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
new file mode 100644
index 0000000000000..45ea812750953
--- /dev/null
+++ b/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
@@ -0,0 +1,123 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
+// RUN: %clang_cc1 -triple aarch64-linux -target-feature +neon -disable-O0-optnone -emit-llvm -o - %s | opt -S -passes=mem2reg | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+typedef __attribute__((neon_vector_type(8))) signed char int8x8_t;
+typedef __attribute__((neon_vector_type(16))) signed char int8x16_t;
+
+typedef __MFloat8x8_t mfloat8x8_t;
+typedef __MFloat8x16_t mfloat8x16_t;
+
+// CHECK-LABEL: define dso_local <8 x i8> @f0(
+// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[X]], <8 x i8> [[X]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT:    ret <8 x i8> [[SHUFFLE]]
+//
+mfloat8x8_t f0(mfloat8x8_t x) {
+  return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0);
+}
+
+// CHECK-LABEL: define dso_local <8 x i8> @f1(
+// CHECK-SAME: <8 x i8> [[X:%.*]], <8 x i8> noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[MASK:%.*]] = and <8 x i8> [[P]], splat (i8 7)
+// CHECK-NEXT:    [[SHUF_IDX:%.*]] = extractelement <8 x i8> [[MASK]], i64 0
+// CHECK-NEXT:    [[SHUF_ELT:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX]]
+// CHECK-NEXT:    [[SHUF_INS:%.*]] = insertelement <8 x i8> poison, i8 [[SHUF_ELT]], i64 0
+// CHECK-NEXT:    [[SHUF_IDX1:%.*]] = extractelement <8 x i8> [[MASK]], i64 1
+// CHECK-NEXT:    [[SHUF_ELT2:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX1]]
+// CHECK-NEXT:    [[SHUF_INS3:%.*]] = insertelement <8 x i8> [[SHUF_INS]], i8 [[SHUF_ELT2]], i64 1
+// CHECK-NEXT:    [[SHUF_IDX4:%.*]] = extractelement <8 x i8> [[MASK]], i64 2
+// CHECK-NEXT:    [[SHUF_ELT5:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX4]]
+// CHECK-NEXT:    [[SHUF_INS6:%.*]] = insertelement <8 x i8> [[SHUF_INS3]], i8 [[SHUF_ELT5]], i64 2
+// CHECK-NEXT:    [[SHUF_IDX7:%.*]] = extractelement <8 x i8> [[MASK]], i64 3
+// CHECK-NEXT:    [[SHUF_ELT8:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX7]]
+// CHECK-NEXT:    [[SHUF_INS9:%.*]] = insertelement <8 x i8> [[SHUF_INS6]], i8 [[SHUF_ELT8]], i64 3
+// CHECK-NEXT:    [[SHUF_IDX10:%.*]] = extractelement <8 x i8> [[MASK]], i64 4
+// CHECK-NEXT:    [[SHUF_ELT11:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX10]]
+// CHECK-NEXT:    [[SHUF_INS12:%.*]] = insertelement <8 x i8> [[SHUF_INS9]], i8 [[SHUF_ELT11]], i64 4
+// CHECK-NEXT:    [[SHUF_IDX13:%.*]] = extractelement <8 x i8> [[MASK]], i64 5
+// CHECK-NEXT:    [[SHUF_ELT14:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX13]]
+// CHECK-NEXT:    [[SHUF_INS15:%.*]] = insertelement <8 x i8> [[SHUF_INS12]], i8 [[SHUF_ELT14]], i64 5
+// CHECK-NEXT:    [[SHUF_IDX16:%.*]] = extractelement <8 x i8> [[MASK]], i64 6
+// CHECK-NEXT:    [[SHUF_ELT17:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX16]]
+// CHECK-NEXT:    [[SHUF_INS18:%.*]] = insertelement <8 x i8> [[SHUF_INS15]], i8 [[SHUF_ELT17]], i64 6
+// CHECK-NEXT:    [[SHUF_IDX19:%.*]] = extractelement <8 x i8> [[MASK]], i64 7
+// CHECK-NEXT:    [[SHUF_ELT20:%.*]] = extractelement <8 x i8> [[X]], i8 [[SHUF_IDX19]]
+// CHECK-NEXT:    [[SHUF_INS21:%.*]] = insertelement <8 x i8> [[SHUF_INS18]], i8 [[SHUF_ELT20]], i64 7
+// CHECK-NEXT:    ret <8 x i8> [[SHUF_INS21]]
+//
+mfloat8x8_t f1(mfloat8x8_t x, int8x8_t p) {
+  return __builtin_shufflevector(x, p);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @f3(
+// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[X]], <16 x i8> [[X]], <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+// CHECK-NEXT:    ret <16 x i8> [[SHUFFLE]]
+//
+mfloat8x16_t f3(mfloat8x16_t x) {
+  return __builtin_shufflevector(x, x, 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2,
+                                 1, 0);
+}
+
+// CHECK-LABEL: define dso_local <16 x i8> @f4(
+// CHECK-SAME: <16 x i8> [[X:%.*]], <16 x i8> noundef [[P:%.*]]) #[[ATTR0]] {
+// CHECK-NEXT:  [[ENTRY:.*:]]
+// CHECK-NEXT:    [[MASK:%.*]] = and <16 x i8> [[P]], splat (i8 15)
+// CHECK-NEXT:    [[SHUF_IDX:%.*]] = extractelement <16 x i8> [[MASK]], i64 0
+// CHECK-NEXT:    [[SHUF_ELT:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX]]
+// CHECK-NEXT:    [[SHUF_INS:%.*]] = insertelement <16 x i8> poison, i8 [[SHUF_ELT]], i64 0
+// CHECK-NEXT:    [[SHUF_IDX1:%.*]] = extractelement <16 x i8> [[MASK]], i64 1
+// CHECK-NEXT:    [[SHUF_ELT2:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX1]]
+// CHECK-NEXT:    [[SHUF_INS3:%.*]] = insertelement <16 x i8> [[SHUF_INS]], i8 [[SHUF_ELT2]], i64 1
+// CHECK-NEXT:    [[SHUF_IDX4:%.*]] = extractelement <16 x i8> [[MASK]], i64 2
+// CHECK-NEXT:    [[SHUF_ELT5:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX4]]
+// CHECK-NEXT:    [[SHUF_INS6:%.*]] = insertelement <16 x i8> [[SHUF_INS3]], i8 [[SHUF_ELT5]], i64 2
+// CHECK-NEXT:    [[SHUF_IDX7:%.*]] = extractelement <16 x i8> [[MASK]], i64 3
+// CHECK-NEXT:    [[SHUF_ELT8:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX7]]
+// CHECK-NEXT:    [[SHUF_INS9:%.*]] = insertelement <16 x i8> [[SHUF_INS6]], i8 [[SHUF_ELT8]], i64 3
+// CHECK-NEXT:    [[SHUF_IDX10:%.*]] = extractelement <16 x i8> [[MASK]], i64 4
+// CHECK-NEXT:    [[SHUF_ELT11:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX10]]
+// CHECK-NEXT:    [[SHUF_INS12:%.*]] = insertelement <16 x i8> [[SHUF_INS9]], i8 [[SHUF_ELT11]], i64 4
+// CHECK-NEXT:    [[SHUF_IDX13:%.*]] = extractelement <16 x i8> [[MASK]], i64 5
+// CHECK-NEXT:    [[SHUF_ELT14:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX13]]
+// CHECK-NEXT:    [[SHUF_INS15:%.*]] = insertelement <16 x i8> [[SHUF_INS12]], i8 
[[SHUF_ELT14]], i64 5 +// CHECK-NEXT: [[SHUF_IDX16:%.*]] = extractelement <16 x i8> [[MASK]], i64 6 +// CHECK-NEXT: [[SHUF_ELT17:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX16]] +// CHECK-NEXT: [[SHUF_INS18:%.*]] = insertelement <16 x i8> [[SHUF_INS15]], i8 [[SHUF_ELT17]], i64 6 +// CHECK-NEXT: [[SHUF_IDX19:%.*]] = extractelement <16 x i8> [[MASK]], i64 7 +// CHECK-NEXT: [[SHUF_ELT20:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX19]] +// CHECK-NEXT: [[SHUF_INS21:%.*]] = insertelement <16 x i8> [[SHUF_INS18]], i8 [[SHUF_ELT20]], i64 7 +// CHECK-NEXT: [[SHUF_IDX22:%.*]] = extractelement <16 x i8> [[MASK]], i64 8 +// CHECK-NEXT: [[SHUF_ELT23:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX22]] +// CHECK-NEXT: [[SHUF_INS24:%.*]] = insertelement <16 x i8> [[SHUF_INS21]], i8 [[SHUF_ELT23]], i64 8 +// CHECK-NEXT: [[SHUF_IDX25:%.*]] = extractelement <16 x i8> [[MASK]], i64 9 +// CHECK-NEXT: [[SHUF_ELT26:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX25]] +// CHECK-NEXT: [[SHUF_INS27:%.*]] = insertelement <16 x i8> [[SHUF_INS24]], i8 [[SHUF_ELT26]], i64 9 +// CHECK-NEXT: [[SHUF_IDX28:%.*]] = extractelement <16 x i8> [[MASK]], i64 10 +// CHECK-NEXT: [[SHUF_ELT29:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX28]] +// CHECK-NEXT: [[SHUF_INS30:%.*]] = insertelement <16 x i8> [[SHUF_INS27]], i8 [[SHUF_ELT29]], i64 10 +// CHECK-NEXT: [[SHUF_IDX31:%.*]] = extractelement <16 x i8> [[MASK]], i64 11 +// CHECK-NEXT: [[SHUF_ELT32:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX31]] +// CHECK-NEXT: [[SHUF_INS33:%.*]] = insertelement <16 x i8> [[SHUF_INS30]], i8 [[SHUF_ELT32]], i64 11 +// CHECK-NEXT: [[SHUF_IDX34:%.*]] = extractelement <16 x i8> [[MASK]], i64 12 +// CHECK-NEXT: [[SHUF_ELT35:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX34]] +// CHECK-NEXT: [[SHUF_INS36:%.*]] = insertelement <16 x i8> [[SHUF_INS33]], i8 [[SHUF_ELT35]], i64 12 +// CHECK-NEXT: [[SHUF_IDX37:%.*]] = extractelement <16 x i8> [[MASK]], i64 13 +// CHECK-NEXT: [[SHUF_ELT38:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX37]] +// CHECK-NEXT: [[SHUF_INS39:%.*]] = insertelement <16 x i8> [[SHUF_INS36]], i8 [[SHUF_ELT38]], i64 13 +// CHECK-NEXT: [[SHUF_IDX40:%.*]] = extractelement <16 x i8> [[MASK]], i64 14 +// CHECK-NEXT: [[SHUF_ELT41:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX40]] +// CHECK-NEXT: [[SHUF_INS42:%.*]] = insertelement <16 x i8> [[SHUF_INS39]], i8 [[SHUF_ELT41]], i64 14 +// CHECK-NEXT: [[SHUF_IDX43:%.*]] = extractelement <16 x i8> [[MASK]], i64 15 +// CHECK-NEXT: [[SHUF_ELT44:%.*]] = extractelement <16 x i8> [[X]], i8 [[SHUF_IDX43]] +// CHECK-NEXT: [[SHUF_INS45:%.*]] = insertelement <16 x i8> [[SHUF_INS42]], i8 [[SHUF_ELT44]], i64 15 +// CHECK-NEXT: ret <16 x i8> [[SHUF_INS45]] +// +mfloat8x16_t f4(mfloat8x16_t x, int8x16_t p) { + return __builtin_shufflevector(x, p); +} diff --git a/clang/test/Sema/builtin-shufflevector.c b/clang/test/Sema/builtin-shufflevector.c new file mode 100644 index 0000000000000..c2dabb9d6585a --- /dev/null +++ b/clang/test/Sema/builtin-shufflevector.c @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -triple aarch64 -fsyntax-only -verify %s + +// REQUIRES: aarch64-registered-target + +typedef __attribute__((neon_vector_type(8))) signed char int8x8_t; +typedef __attribute__((neon_vector_type(16))) signed char int8x16_t; + +typedef __MFloat8x8_t mfloat8x8_t; +typedef __MFloat8x16_t mfloat8x16_t; + +int8x8_t non_vector(int x) { + return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0); + // expected-error@-1 {{first argument to 
'__builtin_shufflevector' must be of vector type}}
+}
+
+mfloat8x8_t unsupported_vector(mfloat8x8_t x) {
+  return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0, 0);
+  // expected-error@-1 {{unsupported vector type for the result}}
+}
+
+int8x8_t non_vector_index(int8x8_t x, int p) {
+  return __builtin_shufflevector(x, p);
+  // expected-error@-1 {{second argument for __builtin_shufflevector must be integer vector with length equal to the length of the first argument}}
+}
+
+int8x8_t bad_vector_index_length(int8x8_t x, int8x16_t p) {
+  return __builtin_shufflevector(x, p);
+  // expected-error@-1 {{second argument for __builtin_shufflevector must be integer vector with length equal to the length of the first argument}}
+}
+
From 9496937a20f0ac206188bd4732da106cef670aaa Mon Sep 17 00:00:00 2001
From: Momchil Velikov
Date: Mon, 16 Dec 2024 09:58:22 +0000
Subject: [PATCH 04/16] [fixup] Fix formatting (NFC)

---
 clang/lib/Sema/SemaChecking.cpp | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
index 1a407c7a6cece..8093a80dcc33b 100644
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -5179,12 +5179,12 @@ ExprResult Sema::BuiltinShuffleVector(CallExpr *TheCall) {
         resType = Context.MFloat8x16Ty;
         break;
       default:
-        return ExprError(Diag(TheCall->getBeginLoc(),
-                              diag::err_shufflevector_unsupported_result_vector_type)
-                         << TheCall->getDirectCallee()
-                         << /*isMorethantwoArgs*/ false
-                         << SourceRange(TheCall->getArg(0)->getBeginLoc(),
-                                        TheCall->getArg(1)->getEndLoc()));
+        return ExprError(
+            Diag(TheCall->getBeginLoc(),
+                 diag::err_shufflevector_unsupported_result_vector_type)
+            << TheCall->getDirectCallee() << /*isMorethantwoArgs*/ false
+            << SourceRange(TheCall->getArg(0)->getBeginLoc(),
+                           TheCall->getArg(1)->getEndLoc()));
       }
     }
     QualType eltType = LHSType->castAs<VectorType>()->getElementType();

From e6da3ea7803a08cc796f7bbe5254f2350b6050cc Mon Sep 17 00:00:00 2001
From: Momchil Velikov
Date: Mon, 13 Jan 2025 17:03:18 +0000
Subject: [PATCH 05/16] [fixup] Address review comments, minor tweaks

---
 clang/lib/AST/Type.cpp                            |  6 ++++--
 .../CodeGen/AArch64/builtin-shufflevector-fp8.c   | 16 ++++++++--------
 clang/test/Sema/builtin-shufflevector.c           |  1 -
 3 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index fb55bab0a67de..a1dea017c866e 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -2579,8 +2579,10 @@ bool Type::isSveVLSBuiltinType() const {
 bool Type::isNeonVectorBuiltinType() const {
   if (const BuiltinType *BT = getAs<BuiltinType>()) {
     switch (BT->getKind()) {
-    case BuiltinType::MFloat8x8:
-    case BuiltinType::MFloat8x16:
+#define AARCH64_VECTOR_TYPE(Name, MangledName, Id, SingletonId)                \
+  case BuiltinType::Id:
+#define SVE_TYPE(Name, Id, SingletonId)
+#include "clang/Basic/AArch64SVEACLETypes.def"
       return true;
     default:
       return false;
diff --git a/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c b/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
index 45ea812750953..74daa7a98914f 100644
--- a/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
+++ b/clang/test/CodeGen/AArch64/builtin-shufflevector-fp8.c
@@ -9,17 +9,17 @@ typedef __attribute__((neon_vector_type(16))) signed char int8x16_t;
 typedef __MFloat8x8_t mfloat8x8_t;
 typedef __MFloat8x16_t mfloat8x16_t;
 
-// CHECK-LABEL: define dso_local <8 x i8> @f0(
+// CHECK-LABEL: define dso_local <8 x i8> @test_8x8(
 // CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <8 x i8> [[X]], <8 x i8> [[X]], <8 x i32> <i32 3, i32 2, i32 1, i32 0, i32 3, i32 2, i32 1, i32 0>
 // CHECK-NEXT:    ret <8 x i8> [[SHUFFLE]]
 //
-mfloat8x8_t f0(mfloat8x8_t x) {
+mfloat8x8_t test_8x8(mfloat8x8_t x) {
   return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0);
 }
 
-// CHECK-LABEL: define dso_local <8 x i8> @f1(
+// CHECK-LABEL: define dso_local <8 x i8> @test_8x8_v(
 // CHECK-SAME: <8 x i8> [[X:%.*]], <8 x i8> noundef [[P:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[MASK:%.*]] = and <8 x i8> [[P]], splat (i8 7)
@@ -49,22 +49,22 @@ mfloat8x8_t f0(mfloat8x8_t x) {
 // CHECK-NEXT:    [[SHUF_INS21:%.*]] = insertelement <8 x i8> [[SHUF_INS18]], i8 [[SHUF_ELT20]], i64 7
 // CHECK-NEXT:    ret <8 x i8> [[SHUF_INS21]]
 //
-mfloat8x8_t f1(mfloat8x8_t x, int8x8_t p) {
+mfloat8x8_t test_8x8_v(mfloat8x8_t x, int8x8_t p) {
   return __builtin_shufflevector(x, p);
 }
 
-// CHECK-LABEL: define dso_local <16 x i8> @f3(
+// CHECK-LABEL: define dso_local <16 x i8> @test_8x16(
 // CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[SHUFFLE:%.*]] = shufflevector <16 x i8> [[X]], <16 x i8> [[X]], <16 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 // CHECK-NEXT:    ret <16 x i8> [[SHUFFLE]]
 //
-mfloat8x16_t f3(mfloat8x16_t x) {
+mfloat8x16_t test_8x16(mfloat8x16_t x) {
   return __builtin_shufflevector(x, x, 7, 6, 5, 4, 3, 2, 1, 0, 7, 6, 5, 4, 3, 2,
                                  1, 0);
 }
 
-// CHECK-LABEL: define dso_local <16 x i8> @f4(
+// CHECK-LABEL: define dso_local <16 x i8> @test_8x16_v(
 // CHECK-SAME: <16 x i8> [[X:%.*]], <16 x i8> noundef [[P:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT:  [[ENTRY:.*:]]
 // CHECK-NEXT:    [[MASK:%.*]] = and <16 x i8> [[P]], splat (i8 15)
@@ -118,6 +118,6 @@ mfloat8x16_t f3(mfloat8x16_t x) {
 // CHECK-NEXT:    [[SHUF_INS45:%.*]] = insertelement <16 x i8> [[SHUF_INS42]], i8 [[SHUF_ELT44]], i64 15
 // CHECK-NEXT:    ret <16 x i8> [[SHUF_INS45]]
 //
-mfloat8x16_t f4(mfloat8x16_t x, int8x16_t p) {
+mfloat8x16_t test_8x16_v(mfloat8x16_t x, int8x16_t p) {
   return __builtin_shufflevector(x, p);
 }
diff --git a/clang/test/Sema/builtin-shufflevector.c b/clang/test/Sema/builtin-shufflevector.c
index c2dabb9d6585a..9094f51b9f7e5 100644
--- a/clang/test/Sema/builtin-shufflevector.c
+++ b/clang/test/Sema/builtin-shufflevector.c
@@ -6,7 +6,6 @@ typedef __attribute__((neon_vector_type(8))) signed char int8x8_t;
 typedef __attribute__((neon_vector_type(16))) signed char int8x16_t;
 
 typedef __MFloat8x8_t mfloat8x8_t;
-typedef __MFloat8x16_t mfloat8x16_t;
 
 int8x8_t non_vector(int x) {
   return __builtin_shufflevector(x, x, 3, 2, 1, 0, 3, 2, 1, 0);

From 42b36c55b3d17a12c89ce6d05f6dfe57a8513b24 Mon Sep 17 00:00:00 2001
From: Momchil Velikov
Date: Thu, 9 Jan 2025 14:02:37 +0000
Subject: [PATCH 06/16] FP8 bitcast

---
 .../clang/Basic/DiagnosticSemaKinds.td        |   2 +
 clang/include/clang/Sema/Sema.h               |   5 +
 clang/lib/Sema/SemaCast.cpp                   |  22 ++
 clang/lib/Sema/SemaExpr.cpp                   |  48 +++++
 clang/test/CodeGen/AArch64/fp8-cast.c         | 193 ++++++++++++++++++
 clang/test/Sema/aarch64-fp8-cast.c            | 102 +++++++++
 6 files changed, 372 insertions(+)
 create mode 100644 clang/test/CodeGen/AArch64/fp8-cast.c
 create mode 100644 clang/test/Sema/aarch64-fp8-cast.c

diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
index b43f2d9143117..9ba671d0bbe78 100644
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -3415,6 +3415,8 @@ def err_typecheck_vector_not_convertable : Error<
   "cannot convert between vector values of different size (%0 and 
%1)">; def err_typecheck_vector_not_convertable_non_scalar : Error< "cannot convert between vector and non-scalar values (%0 and %1)">; +def err_typecheck_vector_not_convertable_non_vector : Error< + "cannot convert between vector type %0 and non-vector type %1">; def err_typecheck_vector_lengths_not_equal : Error< "vector operands do not have the same number of elements (%0 and %1)">; def warn_typecheck_vector_element_sizes_not_equal : Warning< diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index a41f16f6dc8c9..8751e016b96fb 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -7429,6 +7429,11 @@ class Sema final : public SemaBase { /// the perspective of SVE bitcasts. bool isValidSveBitcast(QualType srcType, QualType destType); + /// Check for bitcast beween a regular vector type and builtin Neon vector + /// type. + bool isValidNeonVectorBuiltinTypeBitcast(SourceRange OpRange, QualType SrcTy, + QualType DstTy); + /// Are the two types matrix types and do they have the same dimensions i.e. /// do they have the same number of rows and the same number of columns? bool areMatrixTypesOfTheSameDimension(QualType srcTy, QualType destTy); diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index f98857f852b5a..a619dd556c51c 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -2386,6 +2386,16 @@ static TryCastResult TryReinterpretCast(Sema &Self, ExprResult &SrcExpr, return TC_Success; } + // Allow bitcasting between a regular vector type or a scalar, and a builtin + // Neon vector type. + if ((SrcType->isNeonVectorBuiltinType() || + DestType->isNeonVectorBuiltinType())) { + if (!Self.isValidNeonVectorBuiltinTypeBitcast(OpRange, SrcType, DestType)) + return TC_Failed; + Kind = CK_BitCast; + return TC_Success; + } + // Allow reinterpret_casts between vectors of the same size and // between vectors and integers of the same size. bool destIsVector = DestType->isVectorType(); @@ -3009,6 +3019,18 @@ void CastOperation::CheckCStyleCast() { return; } + // Allow bitcasting between a regular vector type and a builtin Neon vector + // type. + if (SrcType->isNeonVectorBuiltinType() || + DestType->isNeonVectorBuiltinType()) { + if (!Self.isValidNeonVectorBuiltinTypeBitcast(OpRange, SrcType, DestType)) { + SrcExpr = ExprError(); + return; + } + Kind = CK_BitCast; + return; + } + // Allow bitcasting between compatible RVV vector types. 
if ((SrcType->isVectorType() || DestType->isVectorType()) && Self.RISCV().isValidRVVBitcast(SrcType, DestType)) { diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index ae40895980d90..1fc621ac490b6 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -7530,6 +7530,54 @@ bool Sema::isValidSveBitcast(QualType srcTy, QualType destTy) { ValidScalableConversion(destTy, srcTy); } +bool Sema::isValidNeonVectorBuiltinTypeBitcast(SourceRange OpRange, + QualType SrcTy, QualType DstTy) { + assert(SrcTy->isNeonVectorBuiltinType() || DstTy->isNeonVectorBuiltinType()); + + auto checkCast = [&](QualType BT, QualType OT) -> unsigned { + if (OT->isNeonVectorBuiltinType()) { + if (BT.getCanonicalType() != OT.getCanonicalType()) + return diag::err_invalid_conversion_between_vectors; + return 0; + } + + if (!OT->isVectorType()) { + if (!OT->isScalarType()) + return diag::err_typecheck_vector_not_convertable_non_scalar; + + if (!OT->isIntegralType(Context)) + return diag::err_typecheck_vector_not_convertable_non_vector; + } + + uint64_t OTLen; + QualType OTEltTy; + if (const VectorType *VT = OT->getAs()) { + OTLen = VT->getNumElements(); + OTEltTy = VT->getElementType(); + } else { + OTLen = 1; + OTEltTy = OT; + } + + uint64_t BTLen = + BT->getAs()->getKind() == BuiltinType::MFloat8x8 ? 8u + : 16u; + if (BTLen * 8u != OTLen * Context.getTypeSize(OTEltTy)) + return diag::err_invalid_conversion_between_vectors; + return 0; + }; + + QualType BT = SrcTy; + QualType OT = DstTy; + if (!SrcTy->isNeonVectorBuiltinType()) + std::swap(BT, OT); + + if (unsigned msg = checkCast(BT, OT)) + return Diag(OpRange.getBegin(), msg) << BT << OT << OpRange; + + return true; +} + bool Sema::areMatrixTypesOfTheSameDimension(QualType srcTy, QualType destTy) { if (!destTy->isMatrixType() || !srcTy->isMatrixType()) return false; diff --git a/clang/test/CodeGen/AArch64/fp8-cast.c b/clang/test/CodeGen/AArch64/fp8-cast.c new file mode 100644 index 0000000000000..c3a06b0d45157 --- /dev/null +++ b/clang/test/CodeGen/AArch64/fp8-cast.c @@ -0,0 +1,193 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix CHECK-CXX + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -disable-O0-optnone -Werror -Wall -S -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#include + +// Bitcast between FP8 Neon vectors +// CHECK-LABEL: define dso_local <8 x i8> @test_f8_f8( +// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i8> [[X]] +// +// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z10test_f8_f8u13__MFloat8x8_t( +// CHECK-CXX-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret <8 x i8> [[X]] +// +mfloat8x8_t test_f8_f8(mfloat8x8_t x) { + return (mfloat8x8_t) x; +} + +// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_f8( +// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[X]] +// +// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z11testq_f8_f8u14__MFloat8x16_t( +// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// 
CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret <16 x i8> [[X]] +// +mfloat8x16_t testq_f8_f8(mfloat8x16_t x) { + return (mfloat8x16_t) x; +} + +// Bitcast between FP8 and int8 Neon vectors +// CHECK-LABEL: define dso_local <8 x i8> @test_f8_s8( +// CHECK-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i8> [[X]] +// +// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z10test_f8_s810__Int8x8_t( +// CHECK-CXX-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret <8 x i8> [[X]] +// +mfloat8x8_t test_f8_s8(int8x8_t x) { + return (mfloat8x8_t) x; +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_s8_f8( +// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <8 x i8> [[X]] +// +// CHECK-CXX-LABEL: define dso_local noundef <8 x i8> @_Z10test_s8_f8u13__MFloat8x8_t( +// CHECK-CXX-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret <8 x i8> [[X]] +// +int8x8_t test_s8_f8(mfloat8x8_t x) { + return (int8x8_t) x; +} + +// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_s8( +// CHECK-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[X]] +// +// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z11testq_f8_s811__Int8x16_t( +// CHECK-CXX-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret <16 x i8> [[X]] +// +mfloat8x16_t testq_f8_s8(int8x16_t x) { + return (mfloat8x16_t) x; +} + +// CHECK-LABEL: define dso_local <16 x i8> @testq_s8_f8( +// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: ret <16 x i8> [[X]] +// +// CHECK-CXX-LABEL: define dso_local noundef <16 x i8> @_Z11testq_s8_f8u14__MFloat8x16_t( +// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: ret <16 x i8> [[X]] +// +int8x16_t testq_s8_f8(mfloat8x16_t x) { + return (int8x16_t) x; +} + +// Bitcast between FP8 and float32 Neon vectors +// CHECK-LABEL: define dso_local <8 x i8> @test_f8_f32( +// CHECK-SAME: <2 x float> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[X]] to <8 x i8> +// CHECK-NEXT: ret <8 x i8> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z11test_f8_f3213__Float32x2_t( +// CHECK-CXX-SAME: <2 x float> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <2 x float> [[X]] to <8 x i8> +// CHECK-CXX-NEXT: ret <8 x i8> [[TMP0]] +// +mfloat8x8_t test_f8_f32(float32x2_t x) { + return (mfloat8x8_t) x; +} + +// CHECK-LABEL: define dso_local <2 x float> @test_f32_f8( +// CHECK-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[X]] to <2 x float> +// CHECK-NEXT: ret <2 x float> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z11test_f32_f8u13__MFloat8x8_t( +// CHECK-CXX-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x i8> [[X]] to <2 x float> +// CHECK-CXX-NEXT: ret <2 x float> [[TMP0]] +// +float32x2_t test_f32_f8(mfloat8x8_t x) { + return (float32x2_t) x; +} + +// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_f32( +// CHECK-SAME: <4 x float> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = 
bitcast <4 x float> [[X]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z12testq_f8_f3213__Float32x4_t( +// CHECK-CXX-SAME: <4 x float> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[X]] to <16 x i8> +// CHECK-CXX-NEXT: ret <16 x i8> [[TMP0]] +// +mfloat8x16_t testq_f8_f32(float32x4_t x) { + return (mfloat8x16_t) x; +} + +// CHECK-LABEL: define dso_local <4 x float> @testq_f32_f8( +// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to <4 x float> +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z12testq_f32_f8u14__MFloat8x16_t( +// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to <4 x float> +// CHECK-CXX-NEXT: ret <4 x float> [[TMP0]] +// +float32x4_t testq_f32_f8(mfloat8x16_t x) { + return (float32x4_t) x; +} + +// Bitcast between FP8 and poly128_t (which is integral) +// CHECK-LABEL: define dso_local <16 x i8> @testq_f8_p128( +// CHECK-SAME: i128 noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[X]] to <16 x i8> +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z13testq_f8_p128o( +// CHECK-CXX-SAME: i128 noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast i128 [[X]] to <16 x i8> +// CHECK-CXX-NEXT: ret <16 x i8> [[TMP0]] +// +mfloat8x16_t testq_f8_p128(poly128_t x) { + return (mfloat8x16_t) x; +} + +// CHECK-LABEL: define dso_local i128 @testq_p128_f8( +// CHECK-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to i128 +// CHECK-NEXT: ret i128 [[TMP0]] +// +// CHECK-CXX-LABEL: define dso_local noundef i128 @_Z13testq_p128_f8u14__MFloat8x16_t( +// CHECK-CXX-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <16 x i8> [[X]] to i128 +// CHECK-CXX-NEXT: ret i128 [[TMP0]] +// +poly128_t testq_p128_f8(mfloat8x16_t x) { + return (poly128_t) x; +} diff --git a/clang/test/Sema/aarch64-fp8-cast.c b/clang/test/Sema/aarch64-fp8-cast.c new file mode 100644 index 0000000000000..a7b7d2a4bfafd --- /dev/null +++ b/clang/test/Sema/aarch64-fp8-cast.c @@ -0,0 +1,102 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -verify -emit-llvm -o - %s + +// REQUIRES: aarch64-registered-target + +// Bitcast between FP8 Neon vectors +mfloat8x8_t err_test_f8_f8(mfloat8x16_t x) { + return (mfloat8x8_t) x; +// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' (aka '__MFloat8x16_t') and 'mfloat8x8_t' (aka '__MFloat8x8_t') of different size}} +} + +mfloat8x16_t err_testq_f8_f8(mfloat8x8_t x) { + return (mfloat8x16_t) x; +// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and 'mfloat8x16_t' (aka '__MFloat8x16_t') of different size}} +} + +// Bitcast between FP8 and int8 Neon vectors +mfloat8x8_t err_test_f8_s8(int8x16_t x) { + return (mfloat8x8_t) x; +// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and 'int8x16_t' (vector of 16 'int8_t' values) of different size}} +} + +int8x8_t err_test_s8_f8(mfloat8x16_t x) { + return 
(int8x8_t) x; +// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' (aka '__MFloat8x16_t') and 'int8x8_t' (vector of 8 'int8_t' values) of different size}} +} + +mfloat8x16_t err_testq_f8_s8(int8x8_t x) { + return (mfloat8x16_t) x; +// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' (aka '__MFloat8x16_t') and 'int8x8_t' (vector of 8 'int8_t' values) of different size}} +} + +int8x16_t err_testq_s8_f8(mfloat8x8_t x) { + return (int8x16_t) x; +// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and 'int8x16_t' (vector of 16 'int8_t' values) of different size}} +} + +// Bitcast between FP8 and float32 Neon vectors +mfloat8x8_t err_test_f8_f32(float32x4_t x) { + return (mfloat8x8_t) x; +// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and 'float32x4_t' (vector of 4 'float32_t' values) of different size}} +} + +float32x2_t err_test_f32_f8(mfloat8x16_t x) { + return (float32x2_t) x; +// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' (aka '__MFloat8x16_t') and 'float32x2_t' (vector of 2 'float32_t' values) of different size}} +} + +mfloat8x16_t err_testq_f8_f32(float32x2_t x) { + return (mfloat8x16_t) x; +// expected-error@-1 {{invalid conversion between vector type 'mfloat8x16_t' (aka '__MFloat8x16_t') and 'float32x2_t' (vector of 2 'float32_t' values) of different size}} +} + +float32x4_t err_testq_f32_f8(mfloat8x8_t x) { + return (float32x4_t) x; +// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and 'float32x4_t' (vector of 4 'float32_t' values) of different size}} +} + +// Bitcast between FP8 and poly128_t (which is integral) +mfloat8x8_t err_testq_f8_p128(poly128_t x) { + return (mfloat8x8_t) x; +// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and 'poly128_t' (aka 'unsigned __int128') of different size}} +} + +poly128_t err_testq_p128_f8(mfloat8x8_t x) { + return (poly128_t) x; +// expected-error@-1 {{invalid conversion between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and 'poly128_t' (aka 'unsigned __int128') of different size}} +} + +// Bitcast between FP8 and a non-integral type +mfloat8x8_t err_test_f8_ptr(void *p) { + return (mfloat8x8_t) p; +// expected-error@-1 {{cannot convert between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and non-vector type 'void *'}} +} + +void *err_test_ptr_f8(mfloat8x8_t v) { + return (void *) v; +// expected-error@-1 {{cannot convert between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and non-vector type 'void *'}} +} + +mfloat8x8_t err_test_f8_dbl(double v) { + return (mfloat8x8_t) v; +// expected-error@-1 {{cannot convert between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and non-vector type 'double'}} +} + +double err_test_dbl_f8(mfloat8x8_t v) { + return (double) v; +// expected-error@-1 {{cannot convert between vector type 'mfloat8x8_t' (aka '__MFloat8x8_t') and non-vector type 'double'}} +} + +struct S { + char ch[16]; +}; + +mfloat8x16_t err_test_f8_agg(struct S s) { + return (mfloat8x16_t) s; +// expected-error@-1 {{cannot convert between vector and non-scalar values ('mfloat8x16_t' (aka '__MFloat8x16_t') and 'struct S')}} +} + +struct S err_test_agg_f8(mfloat8x16_t v) { + return (struct S) v; +// expected-error@-1 {{cannot convert between vector and non-scalar values ('mfloat8x16_t' (aka '__MFloat8x16_t') and 'struct S')}} +} From 
de28a7692f27d9d2ecac57b308e0da38f5f335af Mon Sep 17 00:00:00 2001
From: Momchil Velikov
Date: Fri, 6 Dec 2024 19:24:16 +0000
Subject: [PATCH 07/16] [AArch64] Add Neon FP8 conversion intrinsics

---
 clang/include/clang/Basic/arm_neon.td         |  23 ++++
 clang/include/clang/Basic/arm_neon_incl.td    |   2 +
 clang/lib/CodeGen/CGBuiltin.cpp               | 120 +++++++++++++++++-
 clang/lib/CodeGen/CodeGenFunction.h           |   3 +
 clang/utils/TableGen/NeonEmitter.cpp          |  22 +++-
 llvm/include/llvm/IR/IntrinsicsAArch64.td     |  22 ++++
 .../lib/Target/AArch64/AArch64InstrFormats.td |  46 +++++--
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   |  14 +-
 8 files changed, 230 insertions(+), 22 deletions(-)

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index ef89fa4358dfe..b79b4749371b5 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2125,6 +2125,29 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in {
   }
 }
 
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,bf16,neon" in {
+  def VBF1CVT_BF16_MF8 : VInst<"vcvt1_bf16_mf8_fpm", "(QB).V", "m">;
+  def VBF1CVT_LOW_BF16_MF8 : VInst<"vcvt1_low_bf16_mf8_fpm", "B.V", "Qm">;
+  def VBF2CVTL_BF16_MF8 : VInst<"vcvt2_bf16_mf8_fpm", "(QB).V", "m">;
+  def VBF2CVTL_LOW_BF16_MF8 : VInst<"vcvt2_low_bf16_mf8_fpm", "B.V", "Qm">;
+  def VBF1CVTL2_HIGH_BF16_MF8 : VInst<"vcvt1_high_bf16_mf8_fpm", "B.V", "Qm">;
+  def VBF2CVTL2_HIGH_BF16_MF8 : VInst<"vcvt2_high_bf16_mf8_fpm", "B.V", "Qm">;
+}
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
+  def VF1CVT_F16_MF8 : VInst<"vcvt1_f16_mf8_fpm", "(>QF).V", "m">;
+  def VF1CVT_LOW_F16_MF8 : VInst<"vcvt1_low_f16_mf8_fpm", "(>F).V", "Qm">;
+  def VF2CVTL_F16_MF8 : VInst<"vcvt2_f16_mf8_fpm", "(>QF).V", "m">;
+  def VF2CVTL_LOW_F16_MF8 : VInst<"vcvt2_low_f16_mf8_fpm", "(>F).V", "Qm">;
+  def VF1CVTL2_HIGH_F16_MF8 : VInst<"vcvt1_high_f16_mf8_fpm", "(>F).V", "Qm">;
+  def VF2CVTL2_HIGH_F16_MF8 : VInst<"vcvt2_high_f16_mf8_fpm", "(>F).V", "Qm">;
+
+  def VCVTN_LOW_F8_F32 : VInst<"vcvt_mf8_f32_fpm", ".(>>QF)(>>QF)V", "m">;
+  def VCVTN_HIGH_F8_F32 : VInst<"vcvt_high_mf8_f32_fpm", ".(q)(>>F)(>>F)V", "Qm">;
+  def VCVTN_F8_F16 : VInst<"vcvt_mf8_f16_fpm", ".(>F)(>F)V", "m">;
+  def VCVTNQ_F8_F16 : VInst<"vcvtq_mf8_f16_fpm", ".(>F)(>F)V", "Qm">;
+}
+
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
   def FAMIN : WInst<"vamin", "...", "fhQdQfQh">;
   def FAMAX : WInst<"vamax", "...", "fhQdQfQh">;
diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td
index fd800e5a6278e..91a2bf3020b9a 100644
--- a/clang/include/clang/Basic/arm_neon_incl.td
+++ b/clang/include/clang/Basic/arm_neon_incl.td
@@ -243,6 +243,7 @@ def OP_UNAVAILABLE : Operation {
 // B: change to BFloat16
 // P: change to polynomial category.
 // p: change polynomial to equivalent integer category. Otherwise nop.
+// V: change to fpm_t
 //
 // >: double element width (vector size unchanged).
 // <: half element width (vector size unchanged).
@@ -301,6 +302,7 @@ class Inst <string n, string p, string t, list<ImmCheck> ch = []>{
 class SInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, ch> {}
 class IInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, ch> {}
 class WInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, ch> {}
+class VInst<string n, string p, string t> : Inst<n, p, t> {}
 
 // The following instruction classes are implemented via operators
 // instead of builtins. As such these declarations are only used for
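
(For orientation, a sketch of how the intrinsics declared above are expected to surface in arm_neon.h once generated. The prototype shape follows the VInst type-specifier strings; the wrapper name is illustrative, and the trailing fpm_t operand supplies the FP8 format via FPMR.)

    #include <arm_neon.h>

    /* Widen eight FP8 lanes to eight half-precision lanes; the source
       FP8 format is taken from the fpm operand, which the compiler
       writes to FPMR before the conversion. */
    float16x8_t widen_fp8(mfloat8x8_t v, fpm_t fpm) {
      return vcvt1_f16_mf8_fpm(v, fpm);
    }
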
diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td index fd800e5a6278e..91a2bf3020b9a 100644 --- a/clang/include/clang/Basic/arm_neon_incl.td +++ b/clang/include/clang/Basic/arm_neon_incl.td @@ -243,6 +243,7 @@ def OP_UNAVAILABLE : Operation { // B: change to BFloat16 // P: change to polynomial category. // p: change polynomial to equivalent integer category. Otherwise nop. +// V: change to fpm_t // // >: double element width (vector size unchanged). // <: half element width (vector size unchanged). @@ -301,6 +302,7 @@ class Inst <string n, string p, string t, Operation o, list<ImmCheck> ch = []>{ class SInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {} class IInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {} class WInst<string n, string p, string t, list<ImmCheck> ch = []> : Inst<n, p, t, OP_NONE, ch> {} +class VInst<string n, string p, string t> : Inst<n, p, t, OP_NONE> {} // The following instruction classes are implemented via operators // instead of builtins. As such these declarations are only used for diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ca03fb665d423..ce7b2620cba9c 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -6898,6 +6898,13 @@ Value *CodeGenFunction::EmitNeonCall(Function *F, SmallVectorImpl<Value *> &Ops, return Builder.CreateCall(F, Ops, name); } +Value *CodeGenFunction::EmitFP8NeonCall(Function *F, + SmallVectorImpl<Value *> &Ops, + Value *FPM, const char *name) { + Builder.CreateCall(CGM.getIntrinsic(Intrinsic::aarch64_set_fpmr), FPM); + return EmitNeonCall(F, Ops, name); +} + Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty, bool neg) { int SV = cast<ConstantInt>(V)->getSExtValue(); @@ -14054,7 +14061,118 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2"); } - + case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm: { + Int = Intrinsic::aarch64_neon_fp8_cvtl1; + llvm::Type *Tys[2]; + Tys[0] = llvm::FixedVectorType::get(BFloatTy, 8); + // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower part of + // the vector. + if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm) { + Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, + /*isQuad*/ false)); + Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0)); + } else + Tys[1] = Ops[0]->getType(); + llvm::Value *FPM = + EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E); + return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt1"); + } + case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm: { + Int = Intrinsic::aarch64_neon_fp8_cvtl2; + llvm::Type *Tys[2]; + Tys[0] = llvm::FixedVectorType::get(BFloatTy, 8); + // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower + // part of the vector. + if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm) { + Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, + /*isQuad*/ false)); + Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0)); + } else + Tys[1] = Ops[0]->getType(); + llvm::Value *FPM = + EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E); + return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt2"); + }
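+  // In the cases above and below, the "_low" builtins convert only lanes + // 0..7 of a 128-bit operand, hence the narrowing with CreateExtractVector, + // while the "_high" builtins pass the whole <16 x i8> operand and are + // selected to the 2-form instructions (e.g. bf1cvtl2), which read the + // upper half themselves.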
+  case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm: { + Int = Intrinsic::aarch64_neon_fp8_cvtl1; + llvm::Type *Tys[2]; + Tys[0] = llvm::FixedVectorType::get(HalfTy, 8); + // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower + // part of the vector. + if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm) { + Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, + /*isQuad*/ false)); + Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0)); + } else + Tys[1] = Ops[0]->getType(); + llvm::Value *FPM = + EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E); + return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt1"); + } + case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm: { + Int = Intrinsic::aarch64_neon_fp8_cvtl2; + llvm::Type *Tys[2]; + Tys[0] = llvm::FixedVectorType::get(HalfTy, 8); + // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower + // part of the vector. + if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm) { + Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, + /*isQuad*/ false)); + Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0)); + } else + Tys[1] = Ops[0]->getType(); + llvm::Value *FPM = + EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E); + return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt2"); + } + case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm: { + Int = Intrinsic::aarch64_neon_fp8_fcvtn; + llvm::Type *Tys[2]; + Tys[0] = llvm::FixedVectorType::get(Int8Ty, 8); + Tys[1] = Ops[0]->getType(); + llvm::Value *FPM = + EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E); + return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn"); + } + case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm: { + Int = Intrinsic::aarch64_neon_fp8_fcvtn; + llvm::Type *Tys[2]; + Tys[0] = llvm::FixedVectorType::get(Int8Ty, 8); + // Gets the expected type, because arm_neon.h casts float16x4_t to int8x8_t + Tys[1] = llvm::FixedVectorType::get(HalfTy, 4); + llvm::Value *FPM = + EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E); + return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn"); + } + case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm: { + Int = Intrinsic::aarch64_neon_fp8_fcvtn; + llvm::Type *Tys[2]; + Tys[0] = llvm::FixedVectorType::get(Int8Ty, 16); + // Gets the expected type, because arm_neon.h casts float16x8_t to int8x16_t + Tys[1] = llvm::FixedVectorType::get(HalfTy, 8); + llvm::Value *FPM = + EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E); + return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn"); + } + case NEON::BI__builtin_neon_vcvt_high_mf8_f32_fpm: { + Int = Intrinsic::aarch64_neon_fp8_fcvtn2; + llvm::Type *Tys[2]; + Tys[0] = llvm::FixedVectorType::get(Int8Ty, 16); + Tys[1] = Ops[1]->getType(); + Ops[0] = Builder.CreateInsertVector(Tys[0], PoisonValue::get(Tys[0]), + Ops[0], Builder.getInt64(0)); + llvm::Value *FPM = + EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E); + return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn2"); + } case NEON::BI__builtin_neon_vamin_f16: case NEON::BI__builtin_neon_vaminq_f16: case NEON::BI__builtin_neon_vamin_f32: diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 457e1477bb2ee..a9d5462a13201 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4682,6 +4682,9 @@ class CodeGenFunction : public CodeGenTypeCache { SmallVectorImpl<llvm::Value *> &O, const char *name, unsigned shift = 0, bool rightshift = false); + llvm::Value *EmitFP8NeonCall(llvm::Function *F, + SmallVectorImpl<llvm::Value *> &O, + llvm::Value *FPM, const char *name); llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx, const llvm::ElementCount &Count); llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx);
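For reference, the two narrowing steps above compose as follows at the source level (an illustrative snippet; all names are placeholders):

    mfloat8x8_t  lo = vcvt_mf8_f32_fpm(vn, vm, fpm);            // 2 x float32x4_t -> FP8 lanes 0-7
    mfloat8x16_t r  = vcvt_high_mf8_f32_fpm(lo, vn2, vm2, fpm); // keeps lo, converts into lanes 8-15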
diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index d7d649dd2456d..61067937acf72 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -74,6 +74,7 @@ enum ClassKind { ClassI, // generic integer instruction, e.g., "i8" suffix ClassS, // signed/unsigned/poly, e.g., "s8", "u8" or "p8" suffix ClassW, // width-specific instruction, e.g., "8" suffix + ClassV, // void-suffix instruction, no suffix ClassB, // bitcast arguments with enum argument to specify type ClassL, // Logical instructions which are op instructions // but we need to not emit any suffix for in our @@ -144,7 +145,7 @@ class Type { private: TypeSpec TS; - enum TypeKind { Void, Float, SInt, UInt, Poly, BFloat16, MFloat8 }; + enum TypeKind { Void, Float, SInt, UInt, Poly, BFloat16, MFloat8, FPM }; TypeKind Kind; bool Immediate, Constant, Pointer; // ScalarForMangling and NoManglingQ are really not suited to live here as @@ -198,6 +199,7 @@ class Type { bool isVoid() const { return Kind == Void; } bool isBFloat16() const { return Kind == BFloat16; } bool isMFloat8() const { return Kind == MFloat8; } + bool isFPM() const { return Kind == FPM; } unsigned getNumElements() const { return Bitwidth / ElementBitwidth; } unsigned getSizeInBits() const { return Bitwidth; } unsigned getElementSizeInBits() const { return ElementBitwidth; } @@ -600,6 +602,7 @@ class NeonEmitter { const Record *SI = R.getClass("SInst"); const Record *II = R.getClass("IInst"); const Record *WI = R.getClass("WInst"); + const Record *VI = R.getClass("VInst"); const Record *SOpI = R.getClass("SOpInst"); const Record *IOpI = R.getClass("IOpInst"); const Record *WOpI = R.getClass("WOpInst"); @@ -609,6 +612,7 @@ class NeonEmitter { ClassMap[SI] = ClassS; ClassMap[II] = ClassI; ClassMap[WI] = ClassW; + ClassMap[VI] = ClassV; ClassMap[SOpI] = ClassS; ClassMap[IOpI] = ClassI; ClassMap[WOpI] = ClassW; @@ -641,6 +645,9 @@ std::string Type::str() const { if (isVoid()) return "void"; + if (isFPM()) + return "fpm_t"; + std::string S; if (isInteger() && !isSigned()) @@ -699,6 +706,8 @@ std::string Type::builtin_str() const { } else if (isMFloat8()) { assert(ElementBitwidth == 8 && "MFloat8 can only be 8 bits"); S += "m"; + } else if (isFPM()) { + S += "UWi"; } else switch (ElementBitwidth) { case 16: S += "h"; break; @@ -888,6 +897,7 @@ void Type::applyTypespec(bool &Quad) { case 'm': Kind = MFloat8; ElementBitwidth = 8; + NoManglingQ = true; break; default: llvm_unreachable("Unhandled type code!"); @@ -925,6 +935,13 @@ void Type::applyModifiers(StringRef Mods) { case 'P': Kind = Poly; break; + case 'V': + Kind = FPM; + Bitwidth = ElementBitwidth = 64; + NumVectors = 0; + Immediate = Constant = Pointer = false; + ScalarForMangling = NoManglingQ = true; + break; case '>': assert(ElementBitwidth < 128); ElementBitwidth *= 2; @@ -1000,6 +1017,9 @@ std::string Intrinsic::getInstTypeCode(Type T, ClassKind CK) const { if (CK == ClassB && TargetGuard == "neon") return ""; + if (this->CK == ClassV) + return ""; + if (T.isBFloat16()) return "bf16";
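In the NeonEmitter changes above, the new `V` modifier turns an argument into `fpm_t`: a 64-bit scalar with builtin type code `UWi` (an unsigned 64-bit integer), and instructions of the new `ClassV` get no type suffix appended to their name. Put together (a sketch inferred from the emitter code and confirmed by the tests later in the series), a definition such as VInst<"vcvt1_bf16_mf8_fpm", "(QB).V", "m"> yields roughly the prototype:

    bfloat16x8_t vcvt1_bf16_mf8_fpm(mfloat8x8_t vn, fpm_t fpm);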
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index cc7a81e15f660..3f841f86f31d8 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -1004,6 +1004,28 @@ def int_aarch64_st64b: Intrinsic<[], !listconcat([llvm_ptr_ty], data512)>; def int_aarch64_st64bv: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], data512)>; def int_aarch64_st64bv0: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], data512)>; + // + // Neon FP8 intrinsics + // + + // Conversions + class AdvSIMD_FP8_1VectorArg_Long_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_anyvector_ty], [IntrReadMem, IntrInaccessibleMemOnly]>; + + def int_aarch64_neon_fp8_cvtl1 : AdvSIMD_FP8_1VectorArg_Long_Intrinsic; + def int_aarch64_neon_fp8_cvtl2 : AdvSIMD_FP8_1VectorArg_Long_Intrinsic; + + def int_aarch64_neon_fp8_fcvtn + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [llvm_anyvector_ty, + LLVMMatchType<1>], + [IntrReadMem, IntrInaccessibleMemOnly]>; + def int_aarch64_neon_fp8_fcvtn2 + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + llvm_anyvector_ty, + LLVMMatchType<1>], + [IntrReadMem, IntrInaccessibleMemOnly]>; } def llvm_nxv1i1_ty : LLVMType<nxv1i1>; diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index f527f7e4eafbc..0ecc35f61903b 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -6558,17 +6558,30 @@ class BaseSIMDThreeVectors<bit Q, bit U, bits<2> size, bits<4> op, // FCVTN (FP16 to FP8) -multiclass SIMDThreeSameSizeVectorCvt<string asm> { - def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b01, 0b1110, V64, V64, asm, ".8b",".4h">; - def v16f8 : BaseSIMDThreeVectors<0b1, 0b0, 0b01, 0b1110, V128, V128, asm, ".16b", ".8h">; +multiclass SIMD_FP8_CVTN_F16<string asm, SDPatternOperator Op> { + let Uses = [FPMR, FPCR], mayLoad = 1 in { + def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b01, 0b1110, V64, V64, asm, ".8b",".4h">; + def v16f8 : BaseSIMDThreeVectors<0b1, 0b0, 0b01, 0b1110, V128, V128, asm, ".16b", ".8h">; + } + def : Pat<(v8i8 (Op (v4f16 V64:$Rn), (v4f16 V64:$Rm))), + (!cast<Instruction>(NAME # v8f8) V64:$Rn, V64:$Rm)>; + def : Pat<(v16i8 (Op (v8f16 V128:$Rn), (v8f16 V128:$Rm))), + (!cast<Instruction>(NAME # v16f8) V128:$Rn, V128:$Rm)>; } -// TODO : Create v16f8 value type // FCVTN, FCVTN2 (FP32 to FP8) -multiclass SIMDThreeVectorCvt<string asm> { - def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b00, 0b1110, V64, V128, asm, ".8b", ".4s">; - def 2v16f8 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1110, asm#2, ".16b", ".4s", - V128, v16i8, v4f32, null_frag>; +multiclass SIMD_FP8_CVTN_F32<string asm, SDPatternOperator Op> { + let Uses = [FPMR, FPCR], mayLoad = 1 in { + def v8f8 : BaseSIMDThreeVectors<0b0, 0b0, 0b00, 0b1110, V64, V128, asm, ".8b", ".4s">; + def 2v16f8 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1110, asm#2, ".16b", ".4s", + V128, v16i8, v4f32, null_frag>; + } + + def : Pat<(v8i8 (Op (v4f32 V128:$Rn), (v4f32 V128:$Rm))), + (!cast<Instruction>(NAME # v8f8) V128:$Rn, V128:$Rm)>; + + def : Pat<(v16i8 (!cast<SDPatternOperator>(Op # 2) (v16i8 V128:$_Rd), (v4f32 V128:$Rn), (v4f32 V128:$Rm))), + (!cast<Instruction>(NAME # 2v16f8) V128:$_Rd, V128:$Rn, V128:$Rm)>; } // TODO: Create a new Value Type v8f8 and v16f8 @@ -7032,11 +7045,18 @@ multiclass SIMDMixedTwoVector<bit U, bits<5> opc, string asm, //---------------------------------------------------------------------------- // FP8 Advanced SIMD two-register miscellaneous //---------------------------------------------------------------------------- -multiclass SIMDMixedTwoVectorFP8<bits<2> sz, string asm> { - def v8f16 : BaseSIMDMixedTwoVector<0b0, 0b1, sz, 0b10111, V64, V128, - asm, ".8h", ".8b", []>; - def 2v8f16 : BaseSIMDMixedTwoVector<0b1, 0b1, sz, 0b10111, V128, V128, - asm#2, ".8h", ".16b", []>; +multiclass SIMD_FP8_CVTL<bits<2> sz, string asm, ValueType dty, SDPatternOperator Op> { + let Uses=[FPMR, FPCR], mayLoad = 1 in { + def NAME : BaseSIMDMixedTwoVector<0b0, 0b1, sz, 0b10111, V64, V128, + asm, ".8h", ".8b", []>; + def NAME#2 : BaseSIMDMixedTwoVector<0b1, 0b1, sz, 0b10111, V128, V128, + asm#2, ".8h", ".16b", []>; + } + def : Pat<(dty (Op (v8i8 V64:$Rn))), + (!cast<Instruction>(NAME) V64:$Rn)>; + + def : Pat<(dty (Op (v16i8 V128:$Rn))), + (!cast<Instruction>(NAME#2) V128:$Rn)>; } class BaseSIMDCmpTwoVector<bit Q, bit U, bits<2> size, bits<2> size2,
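Note that the new instruction definitions above are wrapped in `let Uses = [FPMR, FPCR], mayLoad = 1`: the FP8 conversions read their mode out of FPMR at execution time, and modelling them as loads is presumably what keeps them from being scheduled or folded across the preceding write of FPMR by `llvm.aarch64.set.fpmr` (the intrinsics themselves are declared `[IntrReadMem, IntrInaccessibleMemOnly]` to the same end).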
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index c6f5cdcd1d5fe..b415d843ac2e2 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -10301,13 +10301,13 @@ let Predicates = [HasD128] in { // 2023 Architecture Extensions: //===----------------------------===// -let Uses = [FPMR, FPCR], Predicates = [HasFP8] in { - defm F1CVTL : SIMDMixedTwoVectorFP8<0b00, "f1cvtl">; - defm F2CVTL : SIMDMixedTwoVectorFP8<0b01, "f2cvtl">; - defm BF1CVTL : SIMDMixedTwoVectorFP8<0b10, "bf1cvtl">; - defm BF2CVTL : SIMDMixedTwoVectorFP8<0b11, "bf2cvtl">; - defm FCVTN_F16_F8 : SIMDThreeSameSizeVectorCvt<"fcvtn">; - defm FCVTN_F32_F8 : SIMDThreeVectorCvt<"fcvtn">; +let Predicates = [HasFP8] in { + defm F1CVTL : SIMD_FP8_CVTL<0b00, "f1cvtl", v8f16, int_aarch64_neon_fp8_cvtl1>; + defm F2CVTL : SIMD_FP8_CVTL<0b01, "f2cvtl", v8f16, int_aarch64_neon_fp8_cvtl2>; + defm BF1CVTL : SIMD_FP8_CVTL<0b10, "bf1cvtl", v8bf16, int_aarch64_neon_fp8_cvtl1>; + defm BF2CVTL : SIMD_FP8_CVTL<0b11, "bf2cvtl", v8bf16, int_aarch64_neon_fp8_cvtl2>; + defm FCVTN_F16 : SIMD_FP8_CVTN_F16<"fcvtn", int_aarch64_neon_fp8_fcvtn>; + defm FCVTN_F32 : SIMD_FP8_CVTN_F32<"fcvtn", int_aarch64_neon_fp8_fcvtn>; defm FSCALE : SIMDThreeVectorFscale<0b1, 0b1, 0b111, "fscale", int_aarch64_neon_fp8_fscale>; } // End let Predicates = [HasFP8] From 66730cd674366d5dc67d2ce490272b624263f3d6 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Tue, 10 Dec 2024 18:14:02 +0000 Subject: [PATCH 08/16] [fixup] Add tests, fix calling the wrong LLVM intrinsic --- clang/lib/CodeGen/CGBuiltin.cpp | 4 +- .../fp8-intrinsics/acle_neon_fp8_cvt.c | 308 ++++++++++++++++++ llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll | 112 +++++++ 3 files changed, 422 insertions(+), 2 deletions(-) create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c create mode 100644 llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index ce7b2620cba9c..66501b0e365ae 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -14105,7 +14105,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Tys[0] = llvm::FixedVectorType::get(HalfTy, 8); // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower // part of the vector. - if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm) { + if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm) { Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, /*isQuad*/ false)); Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0)); @@ -14123,7 +14123,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Tys[0] = llvm::FixedVectorType::get(HalfTy, 8); // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower // part of the vector. - if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm) { + if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm) { Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, /*isQuad*/ false)); Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0));
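The autogenerated tests below pin down the expected lowering: each `*_fpm` builtin first emits `call void @llvm.aarch64.set.fpmr(i64 %fpm)` and only then the conversion intrinsic, with the `_low` variants extracting the bottom 64 bits of the source vector first.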
diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c new file mode 100644 index 0000000000000..7543938f48710 --- /dev/null +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c @@ -0,0 +1,308 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix CHECK-CXX + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -S -O3 -o /dev/null %s + +// REQUIRES: aarch64-registered-target + +#include <arm_neon.h> + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_bf16_mf8_fpm( +// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]]) +// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT1_I]] +// +// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z23test_vcvt1_bf16_mf8_fpmu13__MFloat8x8_tm( +// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]]) +// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT1_I]] +// +bfloat16x8_t test_vcvt1_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) { + return vcvt1_bf16_mf8_fpm(op, fpm); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_low_bf16_mf8_fpm( +// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]]) +// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT1_I]] +// +// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z27test_vcvt1_low_bf16_mf8_fpmu14__MFloat8x16_tm( +// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]]) +// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT1_I]] +// +bfloat16x8_t test_vcvt1_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { + return
vcvt1_low_bf16_mf8_fpm(op, fpm); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_bf16_mf8_fpm( +// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]]) +// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT2_I]] +// +// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z23test_vcvt2_bf16_mf8_fpmu13__MFloat8x8_tm( +// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]]) +// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT2_I]] +// +bfloat16x8_t test_vcvt2_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) { + return vcvt2_bf16_mf8_fpm(op, fpm); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_low_bf16_mf8_fpm( +// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]]) +// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT2_I]] +// +// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z27test_vcvt2_low_bf16_mf8_fpmu14__MFloat8x16_tm( +// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]]) +// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT2_I]] +// +bfloat16x8_t test_vcvt2_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { + return vcvt2_low_bf16_mf8_fpm(op, fpm); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_high_bf16_mf8_fpm( +// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]]) +// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT1_I]] +// +// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z28test_vcvt1_high_bf16_mf8_fpmu14__MFloat8x16_tm( +// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]]) +// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT1_I]] +// +bfloat16x8_t test_vcvt1_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { + return vcvt1_high_bf16_mf8_fpm(op, fpm); +} + +// CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_high_bf16_mf8_fpm( +// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> 
@llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]]) +// CHECK-NEXT: ret <8 x bfloat> [[VBFCVT2_I]] +// +// CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z28test_vcvt2_high_bf16_mf8_fpmu14__MFloat8x16_tm( +// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]]) +// CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT2_I]] +// +bfloat16x8_t test_vcvt2_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { + return vcvt2_high_bf16_mf8_fpm(op, fpm); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_f16_mf8_fpm( +// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]]) +// CHECK-NEXT: ret <8 x half> [[VBFCVT1_I]] +// +// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z22test_vcvt1_f16_mf8_fpmu13__MFloat8x8_tm( +// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]]) +// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT1_I]] +// +float16x8_t test_vcvt1_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) { + return vcvt1_f16_mf8_fpm(op, fpm); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_low_f16_mf8_fpm( +// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]]) +// CHECK-NEXT: ret <8 x half> [[VBFCVT1_I]] +// +// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z26test_vcvt1_low_f16_mf8_fpmu14__MFloat8x16_tm( +// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]]) +// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT1_I]] +// +float16x8_t test_vcvt1_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { + return vcvt1_low_f16_mf8_fpm(op, fpm); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_f16_mf8_fpm( +// CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]]) +// CHECK-NEXT: ret <8 x half> [[VBFCVT2_I]] +// +// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z22test_vcvt2_f16_mf8_fpmu13__MFloat8x8_tm( +// CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// 
CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]]) +// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT2_I]] +// +float16x8_t test_vcvt2_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) { + return vcvt2_f16_mf8_fpm(op, fpm); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_low_f16_mf8_fpm( +// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]]) +// CHECK-NEXT: ret <8 x half> [[VBFCVT2_I]] +// +// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z26test_vcvt2_low_f16_mf8_fpmu14__MFloat8x16_tm( +// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]]) +// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT2_I]] +// +float16x8_t test_vcvt2_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { + return vcvt2_low_f16_mf8_fpm(op, fpm); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_high_f16_mf8_fpm( +// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]]) +// CHECK-NEXT: ret <8 x half> [[VBFCVT1_I]] +// +// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z27test_vcvt1_high_f16_mf8_fpmu14__MFloat8x16_tm( +// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]]) +// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT1_I]] +// +float16x8_t test_vcvt1_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { + return vcvt1_high_f16_mf8_fpm(op, fpm); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_high_f16_mf8_fpm( +// CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]]) +// CHECK-NEXT: ret <8 x half> [[VBFCVT2_I]] +// +// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z27test_vcvt2_high_f16_mf8_fpmu14__MFloat8x16_tm( +// CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]]) +// CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT2_I]] +// +float16x8_t test_vcvt2_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { + return vcvt2_high_f16_mf8_fpm(op, fpm); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vcvt_mf8_f32_fpm( +// CHECK-SAME: <4 x float> noundef 
[[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VFCVTN_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]]) +// CHECK-NEXT: ret <8 x i8> [[VFCVTN_I]] +// +// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z21test_vcvt_mf8_f32_fpm13__Float32x4_tS_m( +// CHECK-CXX-SAME: <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VFCVTN_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]]) +// CHECK-CXX-NEXT: ret <8 x i8> [[VFCVTN_I]] +// +mfloat8x8_t test_vcvt_mf8_f32_fpm(float32x4_t vn, float32x4_t vm, fpm_t fpm) { + return vcvt_mf8_f32_fpm(vn, vm, fpm); +} + +// CHECK-LABEL: define dso_local <16 x i8> @test_vcvt_high_mf8_f32_fpm( +// CHECK-SAME: <8 x i8> [[VD:%.*]], <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VD]], <8 x i8> poison, <16 x i32> +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]]) +// CHECK-NEXT: ret <16 x i8> [[VFCVTN2_I]] +// +// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z26test_vcvt_high_mf8_f32_fpmu13__MFloat8x8_t13__Float32x4_tS_m( +// CHECK-CXX-SAME: <8 x i8> [[VD:%.*]], <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VD]], <8 x i8> poison, <16 x i32> +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]]) +// CHECK-CXX-NEXT: ret <16 x i8> [[VFCVTN2_I]] +// +mfloat8x16_t test_vcvt_high_mf8_f32_fpm(mfloat8x8_t vd, float32x4_t vn, + float32x4_t vm, fpm_t fpm) { + return vcvt_high_mf8_f32_fpm(vd, vn, vm, fpm); +} + +// CHECK-LABEL: define dso_local <8 x i8> @test_vcvt_mf8_f16_fpm( +// CHECK-SAME: <4 x half> noundef [[VN:%.*]], <4 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VFCVTN2_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]]) +// CHECK-NEXT: ret <8 x i8> [[VFCVTN2_I]] +// +// CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z21test_vcvt_mf8_f16_fpm13__Float16x4_tS_m( +// CHECK-CXX-SAME: <4 x half> noundef [[VN:%.*]], <4 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]]) +// CHECK-CXX-NEXT: ret <8 x i8> [[VFCVTN2_I]] +// +mfloat8x8_t test_vcvt_mf8_f16_fpm(float16x4_t vn, float16x4_t vm, fpm_t fpm) { + return vcvt_mf8_f16_fpm(vn, vm, fpm); +} + +// CHECK-LABEL: define dso_local <16 
x i8> @test_vcvtq_mf8_f16_fpm( +// CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]]) +// CHECK-NEXT: ret <16 x i8> [[VFCVTN2_I]] +// +// CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z22test_vcvtq_mf8_f16_fpm13__Float16x8_tS_m( +// CHECK-CXX-SAME: <8 x half> noundef [[VN:%.*]], <8 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { +// CHECK-CXX-NEXT: [[ENTRY:.*:]] +// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]]) +// CHECK-CXX-NEXT: ret <16 x i8> [[VFCVTN2_I]] +// +mfloat8x16_t test_vcvtq_mf8_f16_fpm(float16x8_t vn, float16x8_t vm, fpm_t fpm) { + return vcvtq_mf8_f16_fpm(vn, vm, fpm); +} diff --git a/llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll b/llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll new file mode 100644 index 0000000000000..6070380d24234 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/neon-fp8-cvt.ll @@ -0,0 +1,112 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64-linux -mattr=+neon,+fp8 < %s | FileCheck %s + +define <8 x bfloat> @test_vbfcvtl1_low(<8 x i8> %vn) { +; CHECK-LABEL: test_vbfcvtl1_low: +; CHECK: // %bb.0: +; CHECK-NEXT: bf1cvtl v0.8h, v0.8b +; CHECK-NEXT: ret + %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> %vn) + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_vbfcvtl1_high(<16 x i8> %vn) { +; CHECK-LABEL: test_vbfcvtl1_high: +; CHECK: // %bb.0: +; CHECK-NEXT: bf1cvtl2 v0.8h, v0.16b +; CHECK-NEXT: ret + %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> %vn) + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_vbfcvtl2_low(<8 x i8> %vn) { +; CHECK-LABEL: test_vbfcvtl2_low: +; CHECK: // %bb.0: +; CHECK-NEXT: bf2cvtl v0.8h, v0.8b +; CHECK-NEXT: ret + %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> %vn) + ret <8 x bfloat> %res +} + +define <8 x bfloat> @test_vbfcvtl2_high(<16 x i8> %vn) { +; CHECK-LABEL: test_vbfcvtl2_high: +; CHECK: // %bb.0: +; CHECK-NEXT: bf2cvtl2 v0.8h, v0.16b +; CHECK-NEXT: ret + %res = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> %vn) + ret <8 x bfloat> %res +} + + +define <8 x half> @test_vfcvtl1_low(<8 x i8> %vn) { +; CHECK-LABEL: test_vfcvtl1_low: +; CHECK: // %bb.0: +; CHECK-NEXT: f1cvtl v0.8h, v0.8b +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> %vn) + ret <8 x half> %res +} + +define <8 x half> @test_vfcvtl1_high(<16 x i8> %vn) { +; CHECK-LABEL: test_vfcvtl1_high: +; CHECK: // %bb.0: +; CHECK-NEXT: f1cvtl2 v0.8h, v0.16b +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> %vn) + ret <8 x half> %res +} + +define <8 x half> @test_vfcvtl2_low(<8 x i8> %vn) { +; CHECK-LABEL: test_vfcvtl2_low: +; CHECK: // %bb.0: +; CHECK-NEXT: f2cvtl v0.8h, v0.8b +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> %vn) + ret <8 x half> %res +} + +define <8 x half> @test_vfcvtl2_high(<16 x i8> %vn) { +; CHECK-LABEL: test_vfcvtl2_high: +; CHECK: // %bb.0: +; CHECK-NEXT: f2cvtl2 v0.8h, v0.16b 
+; CHECK-NEXT: ret + %res = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> %vn) + ret <8 x half> %res +} + +define <8 x i8> @test_vcvtn_low_f8_f32(<4 x float> %vn, <4 x float> %vm) { +; CHECK-LABEL: test_vcvtn_low_f8_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtn v0.8b, v0.4s, v1.4s +; CHECK-NEXT: ret + %res = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> %vn, <4 x float> %vm) + ret <8 x i8> %res +} + +define <16 x i8> @test_vcvtn_high_f8_f32(<16 x i8> %vd, <4 x float> %vn, <4 x float> %vm) { +; CHECK-LABEL: test_vcvtn_high_f8_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtn2 v0.16b, v1.4s, v2.4s +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> %vd, <4 x float> %vn, <4 x float> %vm) + ret <16 x i8> %res +} + + +define <8 x i8> @test_vcvtn_f8_f16(<4 x half> %vn, <4 x half> %vm) { +; CHECK-LABEL: test_vcvtn_f8_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtn v0.8b, v0.4h, v1.4h +; CHECK-NEXT: ret + %res = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> %vn, <4 x half> %vm) + ret <8 x i8> %res +} + +define <16 x i8> @test_vcvtn2_f8_f16(<8 x half> %vn, <8 x half> %vm) { +; CHECK-LABEL: test_vcvtn2_f8_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fcvtn v0.16b, v0.8h, v1.8h +; CHECK-NEXT: ret + %res = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> %vn, <8 x half> %vm) + ret <16 x i8> %res +} From f7436eff1ede039b89b1c1c089a39a9808993be4 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Tue, 17 Dec 2024 10:49:19 +0000 Subject: [PATCH 09/16] [fixup] Refactor much of the common code into a helper function (NFC) --- clang/lib/CodeGen/CGBuiltin.cpp | 171 +++++++++++----------------- clang/lib/CodeGen/CodeGenFunction.h | 4 + 2 files changed, 69 insertions(+), 106 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 66501b0e365ae..f0e94a76fca6f 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -6911,6 +6911,23 @@ Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty, return ConstantInt::get(Ty, neg ? -SV : SV); } +Value *CodeGenFunction::EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0, + llvm::Type *Ty1, bool Extract, + SmallVectorImpl<Value *> &Ops, + const CallExpr *E, + const char *name) { + llvm::Type *Tys[] = {Ty0, Ty1}; + if (Extract) { + // Op[0] is mfloat8x16_t, but the intrinsic converts only the lower part of + // the vector. + Tys[1] = llvm::FixedVectorType::get(Int8Ty, 8); + Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0)); + } + llvm::Value *FPM = + EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E); + return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name); +} + // Right-shift a vector by a constant.
Value *CodeGenFunction::EmitNeonRShiftImm(Value *Vec, Value *Shift, llvm::Type *Ty, bool usgn, @@ -12847,6 +12864,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return V; unsigned Int; + bool ExtractLow = false; switch (BuiltinID) { default: return nullptr; case NEON::BI__builtin_neon_vbsl_v: @@ -14061,117 +14079,58 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Int = Intrinsic::aarch64_neon_vluti4q_laneq_x2; return EmitNeonCall(CGM.getIntrinsic(Int, Ty), Ops, "vluti4q_laneq_x2"); } - case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm: case NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm: - case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm: { - Int = Intrinsic::aarch64_neon_fp8_cvtl1; - llvm::Type *Tys[2]; - Tys[0] = llvm::FixedVectorType::get(BFloatTy, 8); - // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower part of - // the vector. - if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_bf16_mf8_fpm) { - Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, - /*isQuad*/ false)); - Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0)); - } else - Tys[1] = Ops[0]->getType(); - llvm::Value *FPM = - EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E); - return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt1"); - } - case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm: + ExtractLow = true; + LLVM_FALLTHROUGH; + case NEON::BI__builtin_neon_vcvt1_bf16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt1_high_bf16_mf8_fpm: + return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1, + llvm::FixedVectorType::get(BFloatTy, 8), + Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1"); case NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm: - case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm: { - Int = Intrinsic::aarch64_neon_fp8_cvtl2; - llvm::Type *Tys[2]; - Tys[0] = llvm::FixedVectorType::get(BFloatTy, 8); - // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower - // part of the vector. - if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_bf16_mf8_fpm) { - Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, - /*isQuad*/ false)); - Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0)); - } else - Tys[1] = Ops[0]->getType(); - llvm::Value *FPM = - EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E); - return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt2"); - } - case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm: + ExtractLow = true; + LLVM_FALLTHROUGH; + case NEON::BI__builtin_neon_vcvt2_bf16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt2_high_bf16_mf8_fpm: + return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2, + llvm::FixedVectorType::get(BFloatTy, 8), + Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2"); case NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm: - case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm: { - Int = Intrinsic::aarch64_neon_fp8_cvtl1; - llvm::Type *Tys[2]; - Tys[0] = llvm::FixedVectorType::get(HalfTy, 8); - // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower - // part of the vector. 
- if (BuiltinID == NEON::BI__builtin_neon_vcvt1_low_f16_mf8_fpm) { - Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, - /*isQuad*/ false)); - Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0)); - } else - Tys[1] = Ops[0]->getType(); - llvm::Value *FPM = - EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E); - return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt1"); - } - case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm: + ExtractLow = true; + LLVM_FALLTHROUGH; + case NEON::BI__builtin_neon_vcvt1_f16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt1_high_f16_mf8_fpm: + return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl1, + llvm::FixedVectorType::get(HalfTy, 8), + Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt1"); case NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm: - case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm: { - Int = Intrinsic::aarch64_neon_fp8_cvtl2; - llvm::Type *Tys[2]; - Tys[0] = llvm::FixedVectorType::get(HalfTy, 8); - // Op[1] is mfloat8x16_t, but the intrinsic converts only the lower - // part of the vector. - if (BuiltinID == NEON::BI__builtin_neon_vcvt2_low_f16_mf8_fpm) { - Tys[1] = GetNeonType(this, NeonTypeFlags(Type.getEltType(), false, - /*isQuad*/ false)); - Ops[0] = Builder.CreateExtractVector(Tys[1], Ops[0], Builder.getInt64(0)); - } else - Tys[1] = Ops[0]->getType(); - llvm::Value *FPM = - EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E); - return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vbfcvt2"); - } - case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm: { - Int = Intrinsic::aarch64_neon_fp8_fcvtn; - llvm::Type *Tys[2]; - Tys[0] = llvm::FixedVectorType::get(Int8Ty, 8); - Tys[1] = Ops[0]->getType(); - llvm::Value *FPM = - EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E); - return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn"); - } - case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm: { - Int = Intrinsic::aarch64_neon_fp8_fcvtn; - llvm::Type *Tys[2]; - Tys[0] = llvm::FixedVectorType::get(Int8Ty, 8); - // Gets the expected type, because arm_neon.h casts float16x4_t to int8x8_t - Tys[1] = llvm::FixedVectorType::get(HalfTy, 4); - llvm::Value *FPM = - EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E); - return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn"); - } - case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm: { - Int = Intrinsic::aarch64_neon_fp8_fcvtn; - llvm::Type *Tys[2]; - Tys[0] = llvm::FixedVectorType::get(Int8Ty, 16); - // Gets the expected type, because arm_neon.h casts float16x8_t to int8x16_t - Tys[1] = llvm::FixedVectorType::get(HalfTy, 8); - llvm::Value *FPM = - EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E); - return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn"); - } + ExtractLow = true; + LLVM_FALLTHROUGH; + case NEON::BI__builtin_neon_vcvt2_f16_mf8_fpm: + case NEON::BI__builtin_neon_vcvt2_high_f16_mf8_fpm: + return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_cvtl2, + llvm::FixedVectorType::get(HalfTy, 8), + Ops[0]->getType(), ExtractLow, Ops, E, "vbfcvt2"); + case NEON::BI__builtin_neon_vcvt_mf8_f32_fpm: + return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn, + llvm::FixedVectorType::get(Int8Ty, 8), + Ops[0]->getType(), false, Ops, E, "vfcvtn"); + case NEON::BI__builtin_neon_vcvt_mf8_f16_fpm: + return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn, + llvm::FixedVectorType::get(Int8Ty, 8), + llvm::FixedVectorType::get(HalfTy, 4), false, Ops, + E, 
"vfcvtn"); + case NEON::BI__builtin_neon_vcvtq_mf8_f16_fpm: + return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn, + llvm::FixedVectorType::get(Int8Ty, 16), + llvm::FixedVectorType::get(HalfTy, 8), false, Ops, + E, "vfcvtn"); case NEON::BI__builtin_neon_vcvt_high_mf8_f32_fpm: { - Int = Intrinsic::aarch64_neon_fp8_fcvtn2; - llvm::Type *Tys[2]; - Tys[0] = llvm::FixedVectorType::get(Int8Ty, 16); - Tys[1] = Ops[1]->getType(); - Ops[0] = Builder.CreateInsertVector(Tys[0], PoisonValue::get(Tys[0]), - Ops[0], Builder.getInt64(0)); - llvm::Value *FPM = - EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E); - return EmitFP8NeonCall(CGM.getIntrinsic(Int, Tys), Ops, FPM, "vfcvtn2"); + llvm::Type *Ty = llvm::FixedVectorType::get(Int8Ty, 16); + Ops[0] = Builder.CreateInsertVector(Ty, PoisonValue::get(Ty), Ops[0], + Builder.getInt64(0)); + return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn2, + Ty, Ops[1]->getType(), false, Ops, E, "vfcvtn2"); } case NEON::BI__builtin_neon_vamin_f16: case NEON::BI__builtin_neon_vaminq_f16: diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index a9d5462a13201..010b44f4d38d9 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4685,6 +4685,10 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Value *EmitFP8NeonCall(llvm::Function *F, SmallVectorImpl &O, llvm::Value *FPM, const char *name); + llvm::Value *EmitFP8NeonCvtCall(unsigned IID, llvm::Type *Ty0, + llvm::Type *Ty1, bool Extract, + SmallVectorImpl &Ops, + const CallExpr *E, const char *name); llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx, const llvm::ElementCount &Count); llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx); From 67627ef46ce5d98629fc4f98815d4e5dd47a95ef Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Tue, 17 Dec 2024 11:28:49 +0000 Subject: [PATCH 10/16] [fixup] Add target features test, remove redundant bf16 guard --- clang/include/clang/Basic/arm_neon.td | 2 +- .../acle_neon_fp8_cvt.c | 43 +++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index b79b4749371b5..968536bcf5186 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -2125,7 +2125,7 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in { } } -let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,bf16,neon" in { +let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in { def VBF1CVT_BF16_MF8 : VInst<"vcvt1_bf16_mf8_fpm", "(QB).V", "m">; def VBF1CVT_LOW_BF16_MF8 : VInst<"vcvt1_low_bf16_mf8_fpm", "B.V", "Qm">; def VBF2CVTL_BF16_MF8 : VInst<"vcvt2_bf16_mf8_fpm", "(QB).V", "m">; diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c new file mode 100644 index 0000000000000..2c7004c7968a4 --- /dev/null +++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_cvt.c @@ -0,0 +1,43 @@ +// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +faminmax -emit-llvm -verify %s -o /dev/null + +// REQUIRES: aarch64-registered-target + +#include + +void test_features(float16x4_t vd4, float16x8_t vd8, float32x4_t va4, + mfloat8x8_t v8, mfloat8x16_t v16, fpm_t fpm) { + (void) vcvt1_bf16_mf8_fpm(v8, fpm); + // expected-error@-1 
{{'vcvt1_bf16_mf8_fpm' requires target feature 'fp8'}} + (void) vcvt1_low_bf16_mf8_fpm(v16, fpm); + // expected-error@-1 {{'vcvt1_low_bf16_mf8_fpm' requires target feature 'fp8'}} + (void) vcvt2_bf16_mf8_fpm(v8, fpm); + // expected-error@-1 {{'vcvt2_bf16_mf8_fpm' requires target feature 'fp8'}} + (void) vcvt2_low_bf16_mf8_fpm(v16, fpm); + // expected-error@-1 {{'vcvt2_low_bf16_mf8_fpm' requires target feature 'fp8'}} + + (void) vcvt1_high_bf16_mf8_fpm(v16, fpm); + // expected-error@-1 {{'vcvt1_high_bf16_mf8_fpm' requires target feature 'fp8'}} + (void) vcvt2_high_bf16_mf8_fpm(v16, fpm); + // expected-error@-1 {{'vcvt2_high_bf16_mf8_fpm' requires target feature 'fp8'}} + + (void) vcvt1_f16_mf8_fpm(v8, fpm); + // expected-error@-1 {{'vcvt1_f16_mf8_fpm' requires target feature 'fp8'}} + (void) vcvt1_low_f16_mf8_fpm(v16, fpm); + // expected-error@-1 {{'vcvt1_low_f16_mf8_fpm' requires target feature 'fp8'}} + (void) vcvt2_f16_mf8_fpm(v8, fpm); + // expected-error@-1 {{'vcvt2_f16_mf8_fpm' requires target feature 'fp8'}} + (void) vcvt2_low_f16_mf8_fpm(v16, fpm); + // expected-error@-1 {{'vcvt2_low_f16_mf8_fpm' requires target feature 'fp8'}} + (void) vcvt1_high_f16_mf8_fpm(v16, fpm); + // expected-error@-1 {{'vcvt1_high_f16_mf8_fpm' requires target feature 'fp8'}} + (void) vcvt2_high_f16_mf8_fpm(v16, fpm); + // expected-error@-1 {{'vcvt2_high_f16_mf8_fpm' requires target feature 'fp8'}} + (void) vcvt_mf8_f32_fpm(va4, va4, fpm); + // expected-error@-1 {{'vcvt_mf8_f32_fpm' requires target feature 'fp8'}} + (void) vcvt_high_mf8_f32_fpm(v8, va4, va4, fpm); + // expected-error@-1 {{'vcvt_high_mf8_f32_fpm' requires target feature 'fp8'}} + (void) vcvt_mf8_f16_fpm(vd4, vd4, fpm); + // expected-error@-1 {{'vcvt_mf8_f16_fpm' requires target feature 'fp8'}} + (void) vcvtq_mf8_f16_fpm(vd8, vd8, fpm); + // expected-error@-1 {{'vcvtq_mf8_f16_fpm' requires target feature 'fp8'}} +} From d617b03536b14f1aeec5392199a6cac2f4070532 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Fri, 3 Jan 2025 11:36:36 +0000 Subject: [PATCH 11/16] [fixup] Clear the NoManglingQ flag for FP8 --- clang/include/clang/Basic/arm_neon.td | 21 ++++++++++----------- clang/utils/TableGen/NeonEmitter.cpp | 1 - 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 968536bcf5186..64de6c0cf17a3 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -2127,25 +2127,24 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "lut" in { let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in { def VBF1CVT_BF16_MF8 : VInst<"vcvt1_bf16_mf8_fpm", "(QB).V", "m">; - def VBF1CVT_LOW_BF16_MF8 : VInst<"vcvt1_low_bf16_mf8_fpm", "B.V", "Qm">; + def VBF1CVT_LOW_BF16_MF8 : VInst<"vcvt1_low_bf16_mf8_fpm", "B.V", "Hm">; def VBF2CVTL_BF16_MF8 : VInst<"vcvt2_bf16_mf8_fpm", "(QB).V", "m">; - def VBF2CVTL_LOW_BF16_MF8 : VInst<"vcvt2_low_bf16_mf8_fpm", "B.V", "Qm">; - def VBF1CVTL2_HIGH_BF16_MF8 : VInst<"vcvt1_high_bf16_mf8_fpm", "B.V", "Qm">; - def VBF2CVTL2_HIGH_BF16_MF8 : VInst<"vcvt2_high_bf16_mf8_fpm", "B.V", "Qm">; + def VBF2CVTL_LOW_BF16_MF8 : VInst<"vcvt2_low_bf16_mf8_fpm", "B.V", "Hm">; + def VBF1CVTL2_HIGH_BF16_MF8 : VInst<"vcvt1_high_bf16_mf8_fpm", "B.V", "Hm">; + def VBF2CVTL2_HIGH_BF16_MF8 : VInst<"vcvt2_high_bf16_mf8_fpm", "B.V", "Hm">; } let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in { def VF1CVT_F16_MF8 : VInst<"vcvt1_f16_mf8_fpm", "(>QF).V", "m">; - def VF1CVT_LOW_F16_MF8 : 
VInst<"vcvt1_low_f16_mf8_fpm", "(>F).V", "Qm">; + def VF1CVT_LOW_F16_MF8 : VInst<"vcvt1_low_f16_mf8_fpm", "(>F).V", "Hm">; def VF2CVTL_F16_MF8 : VInst<"vcvt2_f16_mf8_fpm", "(>QF).V", "m">; - def VF2CVTL_LOW_F16_MF8 : VInst<"vcvt2_low_f16_mf8_fpm", "(>F).V", "Qm">; - def VF1CVTL2_HIGH_F16_MF8 : VInst<"vcvt1_high_f16_mf8_fpm", "(>F).V", "Qm">; - def VF2CVTL2_HIGH_F16_MF8 : VInst<"vcvt2_high_f16_mf8_fpm", "(>F).V", "Qm">; + def VF2CVTL_LOW_F16_MF8 : VInst<"vcvt2_low_f16_mf8_fpm", "(>F).V", "Hm">; + def VF1CVTL2_HIGH_F16_MF8 : VInst<"vcvt1_high_f16_mf8_fpm", "(>F).V", "Hm">; + def VF2CVTL2_HIGH_F16_MF8 : VInst<"vcvt2_high_f16_mf8_fpm", "(>F).V", "Hm">; def VCVTN_LOW_F8_F32 : VInst<"vcvt_mf8_f32_fpm", ".(>>QF)(>>QF)V", "m">; - def VCVTN_HIGH_F8_F32 : VInst<"vcvt_high_mf8_f32_fpm", ".(q)(>>F)(>>F)V", "Qm">; - def VCVTN_F8_F16 : VInst<"vcvt_mf8_f16_fpm", ".(>F)(>F)V", "m">; - def VCVTNQ_F8_F16 : VInst<"vcvtq_mf8_f16_fpm", ".(>F)(>F)V", "Qm">; + def VCVTN_HIGH_F8_F32 : VInst<"vcvt_high_mf8_f32_fpm", ".(q)(>>F)(>>F)V", "Hm">; + def VCVTN_F8_F16 : VInst<"vcvt_mf8_f16_fpm", ".(>F)(>F)V", "mQm">; } let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in { diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp index 61067937acf72..eb679d002d3bb 100644 --- a/clang/utils/TableGen/NeonEmitter.cpp +++ b/clang/utils/TableGen/NeonEmitter.cpp @@ -897,7 +897,6 @@ void Type::applyTypespec(bool &Quad) { case 'm': Kind = MFloat8; ElementBitwidth = 8; - NoManglingQ = true; break; default: llvm_unreachable("Unhandled type code!"); From 95d61df48707ea8e63993a3655a423bcf6794f0d Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Mon, 13 Jan 2025 18:26:23 +0000 Subject: [PATCH 12/16] [fixup] Remove instcombine,tailcallelim from test run lines --- .../fp8-intrinsics/acle_neon_fp8_cvt.c | 160 +++++++++--------- 1 file changed, 84 insertions(+), 76 deletions(-) diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c index 7543938f48710..551ccb20c9cbd 100644 --- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_cvt.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix CHECK-CXX +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s +// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -S -O3 -o /dev/null %s @@ -11,15 +11,15 @@ // CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_bf16_mf8_fpm( // CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef 
[[FPM:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]]) +// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]]) // CHECK-NEXT: ret <8 x bfloat> [[VBFCVT1_I]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z23test_vcvt1_bf16_mf8_fpmu13__MFloat8x8_tm( // CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]]) +// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[OP]]) // CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT1_I]] // bfloat16x8_t test_vcvt1_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) { @@ -29,17 +29,17 @@ bfloat16x8_t test_vcvt1_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) { // CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_low_bf16_mf8_fpm( // CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> -// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0) +// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]]) // CHECK-NEXT: ret <8 x bfloat> [[VBFCVT1_I]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z27test_vcvt1_low_bf16_mf8_fpmu14__MFloat8x16_tm( // CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0) +// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v8i8(<8 x i8> [[TMP0]]) // CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT1_I]] // bfloat16x8_t test_vcvt1_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { @@ -49,15 +49,15 @@ bfloat16x8_t test_vcvt1_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { // CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_bf16_mf8_fpm( // CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]]) +// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT2_I:%.*]] = call <8 x bfloat> 
@llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]]) // CHECK-NEXT: ret <8 x bfloat> [[VBFCVT2_I]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z23test_vcvt2_bf16_mf8_fpmu13__MFloat8x8_tm( // CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]]) +// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[OP]]) // CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT2_I]] // bfloat16x8_t test_vcvt2_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) { @@ -67,17 +67,17 @@ bfloat16x8_t test_vcvt2_bf16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) { // CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_low_bf16_mf8_fpm( // CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> -// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0) +// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]]) // CHECK-NEXT: ret <8 x bfloat> [[VBFCVT2_I]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z27test_vcvt2_low_bf16_mf8_fpmu14__MFloat8x16_tm( // CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0) +// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v8i8(<8 x i8> [[TMP0]]) // CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT2_I]] // bfloat16x8_t test_vcvt2_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { @@ -87,15 +87,15 @@ bfloat16x8_t test_vcvt2_low_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { // CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt1_high_bf16_mf8_fpm( // CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]]) +// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]]) // CHECK-NEXT: ret <8 x bfloat> [[VBFCVT1_I]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z28test_vcvt1_high_bf16_mf8_fpmu14__MFloat8x16_tm( // CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: tail call void 
@llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]]) +// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl1.v8bf16.v16i8(<16 x i8> [[OP]]) // CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT1_I]] // bfloat16x8_t test_vcvt1_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { @@ -105,15 +105,15 @@ bfloat16x8_t test_vcvt1_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { // CHECK-LABEL: define dso_local <8 x bfloat> @test_vcvt2_high_bf16_mf8_fpm( // CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]]) +// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]]) // CHECK-NEXT: ret <8 x bfloat> [[VBFCVT2_I]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x bfloat> @_Z28test_vcvt2_high_bf16_mf8_fpmu14__MFloat8x16_tm( // CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]]) +// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = call <8 x bfloat> @llvm.aarch64.neon.fp8.cvtl2.v8bf16.v16i8(<16 x i8> [[OP]]) // CHECK-CXX-NEXT: ret <8 x bfloat> [[VBFCVT2_I]] // bfloat16x8_t test_vcvt2_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { @@ -123,15 +123,15 @@ bfloat16x8_t test_vcvt2_high_bf16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { // CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_f16_mf8_fpm( // CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]]) +// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]]) // CHECK-NEXT: ret <8 x half> [[VBFCVT1_I]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z22test_vcvt1_f16_mf8_fpmu13__MFloat8x8_tm( // CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]]) +// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[OP]]) // CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT1_I]] // float16x8_t test_vcvt1_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) { @@ -141,17 +141,17 @@ float16x8_t test_vcvt1_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) { // CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_low_f16_mf8_fpm( // CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x 
i8> [[OP]], <16 x i8> poison, <8 x i32> -// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0) +// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]]) // CHECK-NEXT: ret <8 x half> [[VBFCVT1_I]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z26test_vcvt1_low_f16_mf8_fpmu14__MFloat8x16_tm( // CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0) +// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v8i8(<8 x i8> [[TMP0]]) // CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT1_I]] // float16x8_t test_vcvt1_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { @@ -161,15 +161,15 @@ float16x8_t test_vcvt1_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { // CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_f16_mf8_fpm( // CHECK-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]]) +// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]]) // CHECK-NEXT: ret <8 x half> [[VBFCVT2_I]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z22test_vcvt2_f16_mf8_fpmu13__MFloat8x8_tm( // CHECK-CXX-SAME: <8 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]]) +// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[OP]]) // CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT2_I]] // float16x8_t test_vcvt2_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) { @@ -179,17 +179,17 @@ float16x8_t test_vcvt2_f16_mf8_fpm(mfloat8x8_t op, fpm_t fpm) { // CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_low_f16_mf8_fpm( // CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> -// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0) +// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT2_I:%.*]] = 
call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]]) // CHECK-NEXT: ret <8 x half> [[VBFCVT2_I]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z26test_vcvt2_low_f16_mf8_fpmu14__MFloat8x16_tm( // CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = shufflevector <16 x i8> [[OP]], <16 x i8> poison, <8 x i32> -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = call <8 x i8> @llvm.vector.extract.v8i8.v16i8(<16 x i8> [[OP]], i64 0) +// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v8i8(<8 x i8> [[TMP0]]) // CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT2_I]] // float16x8_t test_vcvt2_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { @@ -199,15 +199,15 @@ float16x8_t test_vcvt2_low_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { // CHECK-LABEL: define dso_local <8 x half> @test_vcvt1_high_f16_mf8_fpm( // CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]]) +// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]]) // CHECK-NEXT: ret <8 x half> [[VBFCVT1_I]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z27test_vcvt1_high_f16_mf8_fpmu14__MFloat8x16_tm( // CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]]) +// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT1_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl1.v8f16.v16i8(<16 x i8> [[OP]]) // CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT1_I]] // float16x8_t test_vcvt1_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { @@ -217,15 +217,15 @@ float16x8_t test_vcvt1_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { // CHECK-LABEL: define dso_local <8 x half> @test_vcvt2_high_f16_mf8_fpm( // CHECK-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]]) +// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]]) // CHECK-NEXT: ret <8 x half> [[VBFCVT2_I]] // // CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z27test_vcvt2_high_f16_mf8_fpmu14__MFloat8x16_tm( // CHECK-CXX-SAME: <16 x i8> [[OP:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]]) +// CHECK-CXX-NEXT: call void 
@llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VBFCVT2_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.cvtl2.v8f16.v16i8(<16 x i8> [[OP]]) // CHECK-CXX-NEXT: ret <8 x half> [[VBFCVT2_I]] // float16x8_t test_vcvt2_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { @@ -235,15 +235,15 @@ float16x8_t test_vcvt2_high_f16_mf8_fpm(mfloat8x16_t op, fpm_t fpm) { // CHECK-LABEL: define dso_local <8 x i8> @test_vcvt_mf8_f32_fpm( // CHECK-SAME: <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VFCVTN_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]]) +// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VFCVTN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]]) // CHECK-NEXT: ret <8 x i8> [[VFCVTN_I]] // // CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z21test_vcvt_mf8_f32_fpm13__Float32x4_tS_m( // CHECK-CXX-SAME: <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VFCVTN_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]]) +// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VFCVTN_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f32(<4 x float> [[VN]], <4 x float> [[VM]]) // CHECK-CXX-NEXT: ret <8 x i8> [[VFCVTN_I]] // mfloat8x8_t test_vcvt_mf8_f32_fpm(float32x4_t vn, float32x4_t vm, fpm_t fpm) { @@ -253,17 +253,17 @@ mfloat8x8_t test_vcvt_mf8_f32_fpm(float32x4_t vn, float32x4_t vm, fpm_t fpm) { // CHECK-LABEL: define dso_local <16 x i8> @test_vcvt_high_mf8_f32_fpm( // CHECK-SAME: <8 x i8> [[VD:%.*]], <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VD]], <8 x i8> poison, <16 x i32> -// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]]) +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VD]], i64 0) +// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]]) // CHECK-NEXT: ret <16 x i8> [[VFCVTN2_I]] // // CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z26test_vcvt_high_mf8_f32_fpmu13__MFloat8x8_t13__Float32x4_tS_m( // CHECK-CXX-SAME: <8 x i8> [[VD:%.*]], <4 x float> noundef [[VN:%.*]], <4 x float> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VD]], <8 x i8> poison, <16 x i32> -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> 
[[VD]], i64 0) +// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn2.v16i8.v4f32(<16 x i8> [[TMP0]], <4 x float> [[VN]], <4 x float> [[VM]]) // CHECK-CXX-NEXT: ret <16 x i8> [[VFCVTN2_I]] // mfloat8x16_t test_vcvt_high_mf8_f32_fpm(mfloat8x8_t vd, float32x4_t vn, @@ -274,15 +274,19 @@ mfloat8x16_t test_vcvt_high_mf8_f32_fpm(mfloat8x8_t vd, float32x4_t vn, // CHECK-LABEL: define dso_local <8 x i8> @test_vcvt_mf8_f16_fpm( // CHECK-SAME: <4 x half> noundef [[VN:%.*]], <4 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VFCVTN2_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VN]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[VM]] to <8 x i8> +// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VFCVTN2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]]) // CHECK-NEXT: ret <8 x i8> [[VFCVTN2_I]] // // CHECK-CXX-LABEL: define dso_local <8 x i8> @_Z21test_vcvt_mf8_f16_fpm13__Float16x4_tS_m( // CHECK-CXX-SAME: <4 x half> noundef [[VN:%.*]], <4 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = tail call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VN]] to <8 x i8> +// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <4 x half> [[VM]] to <8 x i8> +// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = call <8 x i8> @llvm.aarch64.neon.fp8.fcvtn.v8i8.v4f16(<4 x half> [[VN]], <4 x half> [[VM]]) // CHECK-CXX-NEXT: ret <8 x i8> [[VFCVTN2_I]] // mfloat8x8_t test_vcvt_mf8_f16_fpm(float16x4_t vn, float16x4_t vm, fpm_t fpm) { @@ -292,15 +296,19 @@ mfloat8x8_t test_vcvt_mf8_f16_fpm(float16x4_t vn, float16x4_t vm, fpm_t fpm) { // CHECK-LABEL: define dso_local <16 x i8> @test_vcvtq_mf8_f16_fpm( // CHECK-SAME: <8 x half> noundef [[VN:%.*]], <8 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-NEXT: [[ENTRY:.*:]] -// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-NEXT: [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]]) +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VN]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[VM]] to <16 x i8> +// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) +// CHECK-NEXT: [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]]) // CHECK-NEXT: ret <16 x i8> [[VFCVTN2_I]] // // CHECK-CXX-LABEL: define dso_local <16 x i8> @_Z22test_vcvtq_mf8_f16_fpm13__Float16x8_tS_m( // CHECK-CXX-SAME: <8 x half> noundef [[VN:%.*]], <8 x half> noundef [[VM:%.*]], i64 noundef [[FPM:%.*]]) #[[ATTR0]] { // CHECK-CXX-NEXT: [[ENTRY:.*:]] -// CHECK-CXX-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPM]]) -// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = tail call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]]) +// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast 
<8 x half> [[VN]] to <16 x i8>
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = bitcast <8 x half> [[VM]] to <16 x i8>
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPM]])
+// CHECK-CXX-NEXT: [[VFCVTN2_I:%.*]] = call <16 x i8> @llvm.aarch64.neon.fp8.fcvtn.v16i8.v8f16(<8 x half> [[VN]], <8 x half> [[VM]])
 // CHECK-CXX-NEXT: ret <16 x i8> [[VFCVTN2_I]]
 //
 mfloat8x16_t test_vcvtq_mf8_f16_fpm(float16x8_t vn, float16x8_t vm, fpm_t fpm) {

From 6005e9e5beca028bc5067e8d8449a3b80fa47b6d Mon Sep 17 00:00:00 2001
From: Momchil Velikov
Date: Tue, 17 Dec 2024 11:42:42 +0000
Subject: [PATCH 13/16] [AArch64] Add FP8 Neon intrinsics for dot-product

This patch adds the following intrinsics:

float16x4_t vdot_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpm)
float16x8_t vdotq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm)
float16x4_t vdot_lane_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x4_t vdot_laneq_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vdotq_lane_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float16x8_t vdotq_laneq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)

as well as the corresponding float32 variants:

float32x2_t vdot_f32_mf8_fpm(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpm)
float32x4_t vdotq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm)
float32x2_t vdot_lane_f32_mf8_fpm(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x2_t vdot_laneq_f32_mf8_fpm(float32x2_t vd, mfloat8x8_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vdotq_lane_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, __builtin_constant_p(lane), fpm_t fpm)
float32x4_t vdotq_laneq_f32_mf8_fpm(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, __builtin_constant_p(lane), fpm_t fpm)
---
 clang/include/clang/Basic/arm_neon.td | 22 +++
 clang/include/clang/Basic/arm_neon_incl.td | 2 +-
 clang/lib/CodeGen/CGBuiltin.cpp | 47 ++++++
 clang/lib/CodeGen/CodeGenFunction.h | 5 +
 .../fp8-intrinsics/acle_neon_fp8_fdot.c | 143 ++++++++++++++++++
 .../acle_neon_fp8_fdot.c | 54 +++++++
 llvm/include/llvm/IR/IntrinsicsAArch64.td | 21 +++
 .../lib/Target/AArch64/AArch64InstrFormats.td | 82 ++++++----
 llvm/lib/Target/AArch64/AArch64InstrInfo.td | 14 +-
 llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll | 74 +++++++++
 10 files changed, 424 insertions(+), 40 deletions(-)
 create mode 100644 clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c
 create mode 100644 clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fdot.c
 create mode 100644 llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 64de6c0cf17a3..5d1267213f238 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2147,6 +2147,28 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
 def VCVTN_F8_F16 : VInst<"vcvt_mf8_f16_fpm", ".(>F)(>F)V", "mQm">;
 }
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot2,neon" in {
+ def VDOT_F16_MF8 : VInst<"vdot_f16_mf8_fpm", "(>F)(>F)..V", "m">;
+ def VDOTQ_F16_MF8 : VInst<"vdotq_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+
+ def VDOT_LANE_F16_MF8 : VInst<"vdot_lane_f16_mf8_fpm", "(>F)(>F)..IV", "m", [ImmCheck<3, ImmCheck0_3, 0>]>;
+ def VDOT_LANEQ_F16_MF8 : VInst<"vdot_laneq_f16_mf8_fpm", "(>F)(>F).QIV", "m", [ImmCheck<3, ImmCheck0_7, 0>]>;
+
+ def VDOTQ_LANE_F16_MF8 : VInst<"vdotq_lane_f16_mf8_fpm", "(>F)(>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_3, 0>]>;
+ def VDOTQ_LANEQ_F16_MF8 : VInst<"vdotq_laneq_f16_mf8_fpm", "(>F)(>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_7, 0>]>;
+}
+
+let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot4,neon" in {
+ def VDOT_F32_MF8 : VInst<"vdot_f32_mf8_fpm", "(>>F)(>>F)..V", "m">;
+ def VDOTQ_F32_MF8 : VInst<"vdotq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+
+ def VDOT_LANE_F32_MF8 : VInst<"vdot_lane_f32_mf8_fpm", "(>>F)(>>F)..IV", "m", [ImmCheck<3, ImmCheck0_1, 0>]>;
+ def VDOT_LANEQ_F32_MF8 
: VInst<"vdot_laneq_f32_mf8_fpm", "(>>F)(>>F).QIV", "m", [ImmCheck<3, ImmCheck0_3, 0>]>; + + def VDOTQ_LANE_F32_MF8 : VInst<"vdotq_lane_f32_mf8_fpm", "(>>F)(>>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_1, 0>]>; + def VDOTQ_LANEQ_F32_MF8 : VInst<"vdotq_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_3, 0>]>; +} + let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in { def FAMIN : WInst<"vamin", "...", "fhQdQfQh">; def FAMAX : WInst<"vamax", "...", "fhQdQfQh">; diff --git a/clang/include/clang/Basic/arm_neon_incl.td b/clang/include/clang/Basic/arm_neon_incl.td index 91a2bf3020b9a..b9b9d509c2251 100644 --- a/clang/include/clang/Basic/arm_neon_incl.td +++ b/clang/include/clang/Basic/arm_neon_incl.td @@ -302,7 +302,7 @@ class Inst ch = []>{ class SInst ch = []> : Inst {} class IInst ch = []> : Inst {} class WInst ch = []> : Inst {} -class VInst : Inst {} +class VInst ch = []> : Inst {} // The following instruction classes are implemented via operators // instead of builtins. As such these declarations are only used for diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index f0e94a76fca6f..9c1cef83103d8 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -6905,6 +6905,25 @@ Value *CodeGenFunction::EmitFP8NeonCall(Function *F, return EmitNeonCall(F, Ops, name); } +llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall( + unsigned IID, bool ExtendLane, llvm::Type *RetTy, + SmallVectorImpl &Ops, unsigned ICEArguments, + const CallExpr *E, const char *name) { + + const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() / + RetTy->getPrimitiveSizeInBits(); + llvm::Type *Tys[] = {llvm::FixedVectorType::get(RetTy, ElemCount), + Ops[1]->getType()}; + if (ExtendLane) { + auto *VT = llvm::FixedVectorType::get(Int8Ty, 16); + Ops[2] = Builder.CreateInsertVector(VT, PoisonValue::get(VT), Ops[2], + Builder.getInt64(0)); + } + llvm::Value *FPM = + EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E); + return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name); +} + Value *CodeGenFunction::EmitNeonShiftVector(Value *V, llvm::Type *Ty, bool neg) { int SV = cast(V)->getSExtValue(); @@ -12865,6 +12884,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, unsigned Int; bool ExtractLow = false; + bool ExtendLane = false; switch (BuiltinID) { default: return nullptr; case NEON::BI__builtin_neon_vbsl_v: @@ -14132,6 +14152,33 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, return EmitFP8NeonCvtCall(Intrinsic::aarch64_neon_fp8_fcvtn2, Ty, Ops[1]->getType(), false, Ops, E, "vfcvtn2"); } + + case NEON::BI__builtin_neon_vdot_f16_mf8_fpm: + case NEON::BI__builtin_neon_vdotq_f16_mf8_fpm: + return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2, false, HalfTy, + Ops, ICEArguments, E, "fdot2"); + case NEON::BI__builtin_neon_vdot_lane_f16_mf8_fpm: + case NEON::BI__builtin_neon_vdotq_lane_f16_mf8_fpm: + ExtendLane = true; + LLVM_FALLTHROUGH; + case NEON::BI__builtin_neon_vdot_laneq_f16_mf8_fpm: + case NEON::BI__builtin_neon_vdotq_laneq_f16_mf8_fpm: + return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2_lane, + ExtendLane, HalfTy, Ops, ICEArguments, E, + "fdot2_lane"); + case NEON::BI__builtin_neon_vdot_f32_mf8_fpm: + case NEON::BI__builtin_neon_vdotq_f32_mf8_fpm: + return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4, false, + FloatTy, Ops, ICEArguments, E, "fdot4"); + case NEON::BI__builtin_neon_vdot_lane_f32_mf8_fpm: + case 
NEON::BI__builtin_neon_vdotq_lane_f32_mf8_fpm: + ExtendLane = true; + LLVM_FALLTHROUGH; + case NEON::BI__builtin_neon_vdot_laneq_f32_mf8_fpm: + case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm: + return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane, + ExtendLane, FloatTy, Ops, ICEArguments, E, + "fdot4_lane"); case NEON::BI__builtin_neon_vamin_f16: case NEON::BI__builtin_neon_vaminq_f16: case NEON::BI__builtin_neon_vamin_f32: diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 010b44f4d38d9..b5f690b2f6ee0 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4689,6 +4689,11 @@ class CodeGenFunction : public CodeGenTypeCache { llvm::Type *Ty1, bool Extract, SmallVectorImpl &Ops, const CallExpr *E, const char *name); + llvm::Value *EmitFP8NeonFDOTCall(unsigned IID, bool ExtendLane, + llvm::Type *RetTy, + SmallVectorImpl &Ops, + unsigned ICEArguments, const CallExpr *E, + const char *name); llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx, const llvm::ElementCount &Count); llvm::Value *EmitNeonSplat(llvm::Value *V, llvm::Constant *Idx); diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c new file mode 100644 index 0000000000000..b273bc2abe877 --- /dev/null +++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c @@ -0,0 +1,143 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 + +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s + +// REQUIES: aarch64-registered-target + +#include + +// CHECK-LABEL: define dso_local <4 x half> @test_vdot_f16( +// CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: [[FDOT21_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]]) +// CHECK-NEXT: ret <4 x half> [[FDOT21_I]] +// +float16x4_t test_vdot_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) { + return vdot_f16_mf8_fpm(vd, vn, vm, fpmr); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vdotq_f16( +// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: [[FDOT21_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]]) +// CHECK-NEXT: ret <8 x half> [[FDOT21_I]] +// +float16x8_t test_vdotq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) { + return vdotq_f16_mf8_fpm(vd, vn, vm, fpmr); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_vdot_lane_f16( +// CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// 
CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 3) +// CHECK-NEXT: ret <4 x half> [[FDOT2_LANE1]] +// +float16x4_t test_vdot_lane_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) { + return vdot_lane_f16_mf8_fpm(vd, vn, vm, 3, fpmr); +} + +// CHECK-LABEL: define dso_local <4 x half> @test_vdot_laneq_f16( +// CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <4 x half> [[FDOT2_LANE1]] +// +float16x4_t test_vdot_laneq_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x16_t vm, fpm_t fpmr) { + return vdot_laneq_f16_mf8_fpm(vd, vn, vm, 7, fpmr); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vdotq_lane_f16( +// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 3) +// CHECK-NEXT: ret <8 x half> [[FDOT2_LANE1]] +// +float16x8_t test_vdotq_lane_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) { + return vdotq_lane_f16_mf8_fpm(vd, vn, vm, 3, fpmr); +} + +// CHECK-LABEL: define dso_local <8 x half> @test_vdotq_laneq_f16( +// CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 7) +// CHECK-NEXT: ret <8 x half> [[FDOT2_LANE1]] +// +float16x8_t test_vdotq_laneq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) { + return vdotq_laneq_f16_mf8_fpm(vd, vn, vm, 7, fpmr); +} + +// CHECK-LABEL: define dso_local <2 x float> @test_vdot_f32( +// CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: [[FDOT4_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]]) +// CHECK-NEXT: ret <2 x float> [[FDOT4_I]] +// +float32x2_t test_vdot_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) { + return vdot_f32_mf8_fpm(vd, vn, vm, fpmr); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_f32( +// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: [[FDOT4_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]]) +// CHECK-NEXT: ret 
<4 x float> [[FDOT4_I]] +// +float32x4_t test_vdotq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) { + return vdotq_f32_mf8_fpm(vd, vn, vm, fpmr); +} + +// CHECK-LABEL: define dso_local <2 x float> @test_vdot_lane_f32( +// CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: ret <2 x float> [[FDOT4_LANE]] +// +float32x2_t test_vdot_lane_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) { + return vdot_lane_f32_mf8_fpm(vd, vn, vm, 1, fpmr); +} + +// CHECK-LABEL: define dso_local <2 x float> @test_vdot_laneq_f32( +// CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <2 x float> [[FDOT4_LANE]] +// +float32x2_t test_vdot_laneq_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x16_t vm, fpm_t fpmr) { + return vdot_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpmr); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_lane_f32( +// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1) +// CHECK-NEXT: ret <4 x float> [[FDOT4_LANE]] +// +float32x4_t test_vdotq_lane_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) { + return vdotq_lane_f32_mf8_fpm(vd, vn, vm, 1, fpmr); +} + +// CHECK-LABEL: define dso_local <4 x float> @test_vdotq_laneq_f32( +// CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: [[ENTRY:.*:]] +// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]]) +// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 3) +// CHECK-NEXT: ret <4 x float> [[FDOT4_LANE]] +// +float32x4_t test_vdotq_laneq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) { + return vdotq_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpmr); +} diff --git a/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fdot.c b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fdot.c new file mode 100644 index 0000000000000..8bfe3ac26ab2c --- /dev/null +++ b/clang/test/Sema/aarch64-fp8-intrinsics/acle_neon_fp8_fdot.c @@ -0,0 +1,54 @@ +// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +faminmax -target-feature +fp8 -emit-llvm -verify %s -o /dev/null + +// REQUIRES: aarch64-registered-target + +#include + +void 
test_features(float16x4_t vd4, float16x8_t vd8, float32x4_t va4, float32x2_t va2,
+ mfloat8x8_t v8, mfloat8x16_t v16, fpm_t fpm) {
+ (void) vdot_f16_mf8_fpm(vd4, v8, v8, fpm);
+// expected-error@-1 {{'vdot_f16_mf8_fpm' requires target feature 'fp8dot2'}}
+ (void) vdotq_f16_mf8_fpm(vd8, v16, v16, fpm);
+// expected-error@-1 {{'vdotq_f16_mf8_fpm' requires target feature 'fp8dot2'}}
+ (void) vdot_lane_f16_mf8_fpm(vd4, v8, v8, 3, fpm);
+// expected-error@-1 {{'__builtin_neon_vdot_lane_f16_mf8_fpm' needs target feature fp8dot2,neon}}
+ (void) vdot_laneq_f16_mf8_fpm(vd4, v8, v16, 7, fpm);
+// expected-error@-1 {{'__builtin_neon_vdot_laneq_f16_mf8_fpm' needs target feature fp8dot2,neon}}
+ (void) vdotq_lane_f16_mf8_fpm(vd8, v16, v8, 3, fpm);
+// expected-error@-1 {{'__builtin_neon_vdotq_lane_f16_mf8_fpm' needs target feature fp8dot2,neon}}
+ (void) vdotq_laneq_f16_mf8_fpm(vd8, v16, v16, 7, fpm);
+// expected-error@-1 {{'__builtin_neon_vdotq_laneq_f16_mf8_fpm' needs target feature fp8dot2,neon}}
+
+ (void) vdot_f32_mf8_fpm(va2, v8, v8, fpm);
+// expected-error@-1 {{'vdot_f32_mf8_fpm' requires target feature 'fp8dot4'}}
+ (void) vdotq_f32_mf8_fpm(va4, v16, v16, fpm);
+// expected-error@-1 {{'vdotq_f32_mf8_fpm' requires target feature 'fp8dot4'}}
+ (void) vdot_lane_f32_mf8_fpm(va2, v8, v8, 1, fpm);
+// expected-error@-1 {{'__builtin_neon_vdot_lane_f32_mf8_fpm' needs target feature fp8dot4,neon}}
+ (void) vdot_laneq_f32_mf8_fpm(va2, v8, v16, 3, fpm);
+// expected-error@-1 {{'__builtin_neon_vdot_laneq_f32_mf8_fpm' needs target feature fp8dot4,neon}}
+ (void) vdotq_lane_f32_mf8_fpm(va4, v16, v8, 1, fpm);
+// expected-error@-1 {{'__builtin_neon_vdotq_lane_f32_mf8_fpm' needs target feature fp8dot4,neon}}
+ (void) vdotq_laneq_f32_mf8_fpm(va4, v16, v16, 3, fpm);
+// expected-error@-1 {{'__builtin_neon_vdotq_laneq_f32_mf8_fpm' needs target feature fp8dot4,neon}}
+}
+
+void test_imm(float16x4_t vd4, float16x8_t vd8, float32x2_t va2, float32x4_t va4,
+ mfloat8x8_t v8, mfloat8x16_t v16, fpm_t fpm) {
+ (void) vdot_lane_f16_mf8_fpm(vd4, v8, v8, -1, fpm);
+ // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
+ (void) vdot_laneq_f16_mf8_fpm(vd4, v8, v16, -1, fpm);
+ // expected-error@-1 {{argument value -1 is outside the valid range [0, 7]}}
+ (void) vdotq_lane_f16_mf8_fpm(vd8, v16, v8, -1, fpm);
+ // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
+ (void) vdotq_laneq_f16_mf8_fpm(vd8, v16, v16, -1, fpm);
+ // expected-error@-1 {{argument value -1 is outside the valid range [0, 7]}}
+ (void) vdot_lane_f32_mf8_fpm(va2, v8, v8, -1, fpm);
+ // expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}}
+ (void) vdot_laneq_f32_mf8_fpm(va2, v8, v16, -1, fpm);
+ // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
+ (void) vdotq_lane_f32_mf8_fpm(va4, v16, v8, -1, fpm);
+ // expected-error@-1 {{argument value -1 is outside the valid range [0, 1]}}
+ (void) vdotq_laneq_f32_mf8_fpm(va4, v16, v16, -1, fpm);
+ // expected-error@-1 {{argument value -1 is outside the valid range [0, 3]}}
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
index 3f841f86f31d8..2c7ef00edfa30 100644
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1026,6 +1026,27 @@ def int_aarch64_st64bv0: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], dat
 llvm_anyvector_ty, LLVMMatchType<1>],
 [IntrReadMem, IntrInaccessibleMemOnly]>;
+
+ // Dot-product
+ class 
AdvSIMD_FP8_DOT_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + llvm_anyvector_ty, + LLVMMatchType<1>], + [IntrReadMem, IntrInaccessibleMemOnly]>; + class AdvSIMD_FP8_DOT_LANE_Intrinsic + : DefaultAttrsIntrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, + llvm_anyvector_ty, + llvm_v16i8_ty, + llvm_i32_ty], + [IntrReadMem, IntrInaccessibleMemOnly, ImmArg>]>; + + def int_aarch64_neon_fp8_fdot2 : AdvSIMD_FP8_DOT_Intrinsic; + def int_aarch64_neon_fp8_fdot2_lane : AdvSIMD_FP8_DOT_LANE_Intrinsic; + + def int_aarch64_neon_fp8_fdot4 : AdvSIMD_FP8_DOT_Intrinsic; + def int_aarch64_neon_fp8_fdot4_lane : AdvSIMD_FP8_DOT_LANE_Intrinsic; } def llvm_nxv1i1_ty : LLVMType; diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td index 0ecc35f61903b..78cfdb412416d 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -6584,19 +6584,22 @@ multiclass SIMD_FP8_CVTN_F32 { (!cast(NAME # 2v16f8) V128:$_Rd, V128:$Rn, V128:$Rm)>; } -// TODO: Create a new Value Type v8f8 and v16f8 -multiclass SIMDThreeSameVectorDOT2 { - def v4f16 : BaseSIMDThreeSameVectorDot<0b0, 0b0, 0b01, 0b1111, asm, ".4h", ".8b", - V64, v4f16, v8i8, null_frag>; - def v8f16 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b01, 0b1111, asm, ".8h", ".16b", - V128, v8f16, v16i8, null_frag>; +multiclass SIMD_FP8_Dot2 { + let Uses = [FPMR, FPCR], mayLoad = 1 in { + def v4f16 : BaseSIMDThreeSameVectorDot<0b0, 0b0, 0b01, 0b1111, asm, ".4h", ".8b", + V64, v4f16, v8i8, op>; + def v8f16 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b01, 0b1111, asm, ".8h", ".16b", + V128, v8f16, v16i8, op>; + } } -multiclass SIMDThreeSameVectorDOT4 { - def v2f32 : BaseSIMDThreeSameVectorDot<0b0, 0b0, 0b00, 0b1111, asm, ".2s", ".8b", - V64, v2f32, v8i8, null_frag>; - def v4f32 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1111, asm, ".4s", ".16b", - V128, v4f32, v16i8, null_frag>; +multiclass SIMD_FP8_Dot4 { + let Uses = [FPMR, FPCR], mayLoad = 1 in { + def v2f32 : BaseSIMDThreeSameVectorDot<0b0, 0b0, 0b00, 0b1111, asm, ".2s", ".8b", + V64, v2f32, v8i8, op>; + def v4f32 : BaseSIMDThreeSameVectorDot<0b1, 0b0, 0b00, 0b1111, asm, ".4s", ".16b", + V128, v4f32, v16i8, op>; + } } let mayRaiseFPException = 1, Uses = [FPCR] in @@ -9140,15 +9143,16 @@ class BaseSIMDThreeSameVectorIndexS size, bits<4> opc, str string dst_kind, string lhs_kind, string rhs_kind, RegisterOperand RegType, ValueType AccumType, ValueType InputType, + AsmVectorIndexOpnd VIdx, SDPatternOperator OpNode> : BaseSIMDIndexedTied { + VIdx:$idx)))))))]> { bits<2> idx; let Inst{21} = idx{0}; // L let Inst{11} = idx{1}; // H @@ -9157,17 +9161,24 @@ class BaseSIMDThreeSameVectorIndexS size, bits<4> opc, str multiclass SIMDThreeSameVectorDotIndex size, string asm, SDPatternOperator OpNode> { def v8i8 : BaseSIMDThreeSameVectorIndexS<0, U, size, {0b111, Mixed}, asm, ".2s", ".8b", ".4b", - V64, v2i32, v8i8, OpNode>; + V64, v2i32, v8i8, VectorIndexS, OpNode>; def v16i8 : BaseSIMDThreeSameVectorIndexS<1, U, size, {0b111, Mixed}, asm, ".4s", ".16b", ".4b", - V128, v4i32, v16i8, OpNode>; + V128, v4i32, v16i8, VectorIndexS, OpNode>; } -// TODO: The vectors v8i8 and v16i8 should be v8f8 and v16f8 -multiclass SIMDThreeSameVectorFP8DOT4Index { - def v8f8 : BaseSIMDThreeSameVectorIndexS<0b0, 0b0, 0b00, 0b0000, asm, ".2s", ".8b", ".4b", - V64, v2f32, v8i8, null_frag>; - def v16f8 : BaseSIMDThreeSameVectorIndexS<0b1, 0b0, 0b00, 0b0000, asm, ".4s", ".16b",".4b", - V128, v4f32, v16i8, null_frag>; 
+multiclass SIMD_FP8_Dot4_Index { + let Uses = [FPMR, FPCR], mayLoad = 1 in { + def v2f32 : BaseSIMDThreeSameVectorIndexS<0b0, 0b0, 0b00, 0b0000, asm, ".2s", ".8b", ".4b", + V64, v2f32, v8i8, VectorIndexS32b_timm, null_frag>; + def v4f32 : BaseSIMDThreeSameVectorIndexS<0b1, 0b0, 0b00, 0b0000, asm, ".4s", ".16b",".4b", + V128, v4f32, v16i8, VectorIndexS32b_timm, null_frag>; + } + + def : Pat<(v2f32 (op (v2f32 V64:$Rd), (v8i8 V64:$Rn), (v16i8 V128:$Rm), VectorIndexS32b_timm:$Idx)), + (!cast(NAME # v2f32) $Rd, $Rn, $Rm, $Idx)>; + + def : Pat<(v4f32 (op (v4f32 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128:$Rm), VectorIndexS32b_timm:$Idx)), + (!cast(NAME # v4f32) $Rd, $Rn, $Rm, $Idx)>; } // ARMv8.2-A Fused Multiply Add-Long Instructions (Indexed) @@ -9176,14 +9187,15 @@ class BaseSIMDThreeSameVectorIndexH sz, bits<4> opc, strin string dst_kind, string lhs_kind, string rhs_kind, RegisterOperand RegType, RegisterOperand RegType_lo, ValueType AccumType, - ValueType InputType, SDPatternOperator OpNode> : + ValueType InputType, AsmVectorIndexOpnd VIdx, + SDPatternOperator OpNode> : BaseSIMDIndexedTied { + VIdx:$idx)))))]> { // idx = H:L:M bits<3> idx; let Inst{11} = idx{2}; // H @@ -9194,19 +9206,25 @@ class BaseSIMDThreeSameVectorIndexH sz, bits<4> opc, strin multiclass SIMDThreeSameVectorFMLIndex opc, string asm, SDPatternOperator OpNode> { def v4f16 : BaseSIMDThreeSameVectorIndexH<0, U, 0b10, opc, asm, ".2s", ".2h", ".h", - V64, V128_lo, v2f32, v4f16, OpNode>; + V64, V128_lo, v2f32, v4f16, VectorIndexH, OpNode>; def v8f16 : BaseSIMDThreeSameVectorIndexH<1, U, 0b10, opc, asm, ".4s", ".4h", ".h", - V128, V128_lo, v4f32, v8f16, OpNode>; + V128, V128_lo, v4f32, v8f16, VectorIndexH, OpNode>; } //---------------------------------------------------------------------------- // FP8 Advanced SIMD vector x indexed element -// TODO: Replace value types v8i8 and v16i8 by v8f8 and v16f8 -multiclass SIMDThreeSameVectorFP8DOT2Index { - def v4f16 : BaseSIMDThreeSameVectorIndexH<0b0, 0b0, 0b01, 0b0000, asm, ".4h", ".8b", ".2b", - V64, V128_lo, v4f16, v8i8, null_frag>; - def v8f16 : BaseSIMDThreeSameVectorIndexH<0b1, 0b0, 0b01, 0b0000, asm, ".8h", ".16b", ".2b", - V128, V128_lo, v8f16, v8i16, null_frag>; +multiclass SIMD_FP8_Dot2_Index { + let Uses = [FPMR, FPCR], mayLoad = 1 in { + def v4f16 : BaseSIMDThreeSameVectorIndexH<0b0, 0b0, 0b01, 0b0000, asm, ".4h", ".8b", ".2b", + V64, V128_lo, v4f16, v8i8, VectorIndexH32b_timm, null_frag>; + def v8f16 : BaseSIMDThreeSameVectorIndexH<0b1, 0b0, 0b01, 0b0000, asm, ".8h", ".16b", ".2b", + V128, V128_lo, v8f16, v16i8, VectorIndexH32b_timm, null_frag>; + } + def : Pat<(v4f16 (op (v4f16 V64:$Rd), (v8i8 V64:$Rn), (v16i8 V128_lo:$Rm), VectorIndexH32b_timm:$Idx)), + (!cast(NAME # v4f16) $Rd, $Rn, $Rm, $Idx)>; + + def : Pat<(v8f16 (op (v8f16 V128:$Rd), (v16i8 V128:$Rn), (v16i8 V128_lo:$Rm), VectorIndexH32b_timm:$Idx)), + (!cast(NAME # v8f16) $Rd, $Rn, $Rm, $Idx)>; } multiclass SIMDFPIndexed opc, string asm, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index b415d843ac2e2..463a8f2f95e33 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -1489,7 +1489,7 @@ class BaseSIMDSUDOTIndex : BaseSIMDThreeSameVectorIndexS { + InputType, VectorIndexS, null_frag> { let Pattern = [(set (AccumType RegType:$dst), (AccumType (AArch64usdot (AccumType RegType:$Rd), (InputType (bitconvert (AccumType @@ -10346,14 +10346,14 @@ let Uses = [FPMR, FPCR], Predicates = [HasFP8FMA] in { defm 
FMLALLTT : SIMDThreeSameVectorMLAL<0b1, 0b01, "fmlalltt">; } // End let Predicates = [HasFP8FMA] -let Uses = [FPMR, FPCR], Predicates = [HasFP8DOT2] in { - defm FDOTlane : SIMDThreeSameVectorFP8DOT2Index<"fdot">; - defm FDOT : SIMDThreeSameVectorDOT2<"fdot">; +let Predicates = [HasFP8DOT2] in { + defm FDOTlane : SIMD_FP8_Dot2_Index<"fdot", int_aarch64_neon_fp8_fdot2_lane>; + defm FDOT : SIMD_FP8_Dot2<"fdot", int_aarch64_neon_fp8_fdot2>; } // End let Predicates = [HasFP8DOT2] -let Uses = [FPMR, FPCR], Predicates = [HasFP8DOT4] in { - defm FDOTlane : SIMDThreeSameVectorFP8DOT4Index<"fdot">; - defm FDOT : SIMDThreeSameVectorDOT4<"fdot">; +let Predicates = [HasFP8DOT4] in { + defm FDOTlane : SIMD_FP8_Dot4_Index<"fdot", int_aarch64_neon_fp8_fdot4_lane>; + defm FDOT : SIMD_FP8_Dot4<"fdot", int_aarch64_neon_fp8_fdot4>; } // End let Predicates = [HasFP8DOT4] //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll b/llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll new file mode 100644 index 0000000000000..b7a35c5fddf17 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/fp8-neon-fdot.ll @@ -0,0 +1,74 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64-linux -mattr=+neon,+fp8dot2,+fp8dot4 < %s | FileCheck %s + +define <4 x half> @test_fdot_f16(<4 x half> %vd, <8 x i8> %vn, <8 x i8> %vm) { +; CHECK-LABEL: test_fdot_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fdot v0.4h, v1.8b, v2.8b +; CHECK-NEXT: ret + %res = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> %vd, <8 x i8> %vn, <8 x i8> %vm) + ret <4 x half> %res +} + +define <8 x half> @test_fdotq_f16(<8 x half> %vd, <16 x i8> %vn, <16 x i8> %vm) { +; CHECK-LABEL: test_fdotq_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fdot v0.8h, v1.16b, v2.16b +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> %vd, <16 x i8> %vn, <16 x i8> %vm) + ret <8 x half> %res +} + +define <4 x half> @test_fdot_lane_f16(<4 x half> %vd, <8 x i8> %vn, <16 x i8> %vm) { +; CHECK-LABEL: test_fdot_lane_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fdot v0.4h, v1.8b, v2.2b[0] +; CHECK-NEXT: ret + %res = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> %vd, <8 x i8> %vn, <16 x i8> %vm, i32 0) + ret <4 x half> %res +} + +define <8 x half> @test_fdotq_lane_f16(<8 x half> %vd, <16 x i8> %vn, <16 x i8> %vm) { +; CHECK-LABEL: test_fdotq_lane_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: fdot v0.8h, v1.16b, v2.2b[7] +; CHECK-NEXT: ret + %res = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> %vd, <16 x i8> %vn, <16 x i8> %vm, i32 7) + ret <8 x half> %res +} + +define <2 x float> @test_fdot_f32(<2 x float> %vd, <8 x i8> %vn, <8 x i8> %vm) { +; CHECK-LABEL: test_fdot_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fdot v0.2s, v1.8b, v2.8b +; CHECK-NEXT: ret + %res = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.v2f32.v8i8(<2 x float> %vd, <8 x i8> %vn, <8 x i8> %vm) + ret <2 x float> %res +} + +define <4 x float> @test_fdotq_f32(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm) { +; CHECK-LABEL: test_fdotq_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fdot v0.4s, v1.16b, v2.16b +; CHECK-NEXT: ret + %res = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.v4f32.v16i8(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm) + ret <4 x float> %res +} + +define <2 x float> @test_fdot_lane_f32(<2 x float> %vd, <8 x i8> %vn, <16 x i8> %vm) { +; CHECK-LABEL: test_fdot_lane_f32: +; CHECK: // 
%bb.0: +; CHECK-NEXT: fdot v0.2s, v1.8b, v2.4b[0] +; CHECK-NEXT: ret + %res = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> %vd, <8 x i8> %vn, <16 x i8> %vm, i32 0) + ret <2 x float> %res +} + +define <4 x float> @test_fdotq_lane_f32(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm) { +; CHECK-LABEL: test_fdotq_lane_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: fdot v0.4s, v1.16b, v2.4b[3] +; CHECK-NEXT: ret + %res = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> %vd, <16 x i8> %vn, <16 x i8> %vm, i32 3) + ret <4 x float> %res +} From 4b0118447e342661972b1a1b59f1c7d579d91fb2 Mon Sep 17 00:00:00 2001 From: Momchil Velikov Date: Tue, 17 Dec 2024 13:23:31 +0000 Subject: [PATCH 14/16] [fixup] Remove not needed argument (NFC) --- clang/lib/CodeGen/CGBuiltin.cpp | 15 ++++++--------- clang/lib/CodeGen/CodeGenFunction.h | 3 +-- 2 files changed, 7 insertions(+), 11 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 9c1cef83103d8..69ce28ecb642e 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -6907,8 +6907,7 @@ Value *CodeGenFunction::EmitFP8NeonCall(Function *F, llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall( unsigned IID, bool ExtendLane, llvm::Type *RetTy, - SmallVectorImpl &Ops, unsigned ICEArguments, - const CallExpr *E, const char *name) { + SmallVectorImpl &Ops, const CallExpr *E, const char *name) { const unsigned ElemCount = Ops[0]->getType()->getPrimitiveSizeInBits() / RetTy->getPrimitiveSizeInBits(); @@ -6920,7 +6919,7 @@ llvm::Value *CodeGenFunction::EmitFP8NeonFDOTCall( Builder.getInt64(0)); } llvm::Value *FPM = - EmitScalarOrConstFoldImmArg(ICEArguments, E->getNumArgs() - 1, E); + EmitScalarOrConstFoldImmArg(/* ICEArguments */ 0, E->getNumArgs() - 1, E); return EmitFP8NeonCall(CGM.getIntrinsic(IID, Tys), Ops, FPM, name); } @@ -14156,7 +14155,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vdot_f16_mf8_fpm: case NEON::BI__builtin_neon_vdotq_f16_mf8_fpm: return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2, false, HalfTy, - Ops, ICEArguments, E, "fdot2"); + Ops, E, "fdot2"); case NEON::BI__builtin_neon_vdot_lane_f16_mf8_fpm: case NEON::BI__builtin_neon_vdotq_lane_f16_mf8_fpm: ExtendLane = true; @@ -14164,12 +14163,11 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vdot_laneq_f16_mf8_fpm: case NEON::BI__builtin_neon_vdotq_laneq_f16_mf8_fpm: return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot2_lane, - ExtendLane, HalfTy, Ops, ICEArguments, E, - "fdot2_lane"); + ExtendLane, HalfTy, Ops, E, "fdot2_lane"); case NEON::BI__builtin_neon_vdot_f32_mf8_fpm: case NEON::BI__builtin_neon_vdotq_f32_mf8_fpm: return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4, false, - FloatTy, Ops, ICEArguments, E, "fdot4"); + FloatTy, Ops, E, "fdot4"); case NEON::BI__builtin_neon_vdot_lane_f32_mf8_fpm: case NEON::BI__builtin_neon_vdotq_lane_f32_mf8_fpm: ExtendLane = true; @@ -14177,8 +14175,7 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, case NEON::BI__builtin_neon_vdot_laneq_f32_mf8_fpm: case NEON::BI__builtin_neon_vdotq_laneq_f32_mf8_fpm: return EmitFP8NeonFDOTCall(Intrinsic::aarch64_neon_fp8_fdot4_lane, - ExtendLane, FloatTy, Ops, ICEArguments, E, - "fdot4_lane"); + ExtendLane, FloatTy, Ops, E, "fdot4_lane"); case NEON::BI__builtin_neon_vamin_f16: case NEON::BI__builtin_neon_vaminq_f16: case NEON::BI__builtin_neon_vamin_f32: diff --git 
From b57c87e10d69f5acf4209003f082a0191f899e2e Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov@arm.com>
Date: Fri, 3 Jan 2025 12:02:55 +0000
Subject: [PATCH 15/16] [fixup] Update intrinsic declarations

---
 clang/include/clang/Basic/arm_neon.td | 22 ++++++++++------------
 1 file changed, 10 insertions(+), 12 deletions(-)

diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
index 5d1267213f238..6952cbd93bc91 100644
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -2148,25 +2148,23 @@ let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8,neon" in {
 }
 
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot2,neon" in {
-  def VDOT_F16_MF8 : VInst<"vdot_f16_mf8_fpm", "(>F)(>F)..V", "m">;
-  def VDOTQ_F16_MF8 : VInst<"vdotq_f16_mf8_fpm", "(>F)(>F)..V", "Qm">;
+  def VDOT_F16_MF8 : VInst<"vdot_f16_mf8_fpm", "(>F)(>F)..V", "mQm">;
 
-  def VDOT_LANE_F16_MF8 : VInst<"vdot_lane_f16_mf8_fpm", "(>F)(>F)..IV", "m", [ImmCheck<3, ImmCheck0_3, 0>]>;
-  def VDOT_LANEQ_F16_MF8 : VInst<"vdot_laneq_f16_mf8_fpm", "(>F)(>F).QIV", "m", [ImmCheck<3, ImmCheck0_7, 0>]>;
+  def VDOT_LANE_F16_MF8 : VInst<"vdot_lane_f16_mf8_fpm", "(>F)(>F)..IV", "m", [ImmCheck<3, ImmCheck0_3, 0>]>;
+  def VDOT_LANEQ_F16_MF8 : VInst<"vdot_laneq_f16_mf8_fpm", "(>F)(>F).QIV", "m", [ImmCheck<3, ImmCheck0_7, 0>]>;
 
-  def VDOTQ_LANE_F16_MF8 : VInst<"vdotq_lane_f16_mf8_fpm", "(>F)(>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_3, 0>]>;
-  def VDOTQ_LANEQ_F16_MF8 : VInst<"vdotq_laneq_f16_mf8_fpm", "(>F)(>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_7, 0>]>;
+  def VDOTQ_LANE_F16_MF8 : VInst<"vdot_lane_f16_mf8_fpm", "(>F)(>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_3, 0>]>;
+  def VDOTQ_LANEQ_F16_MF8 : VInst<"vdot_laneq_f16_mf8_fpm", "(>F)(>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_7, 0>]>;
 }
 
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "fp8dot4,neon" in {
-  def VDOT_F32_MF8 : VInst<"vdot_f32_mf8_fpm", "(>>F)(>>F)..V", "m">;
-  def VDOTQ_F32_MF8 : VInst<"vdotq_f32_mf8_fpm", "(>>F)(>>F)..V", "Qm">;
+  def VDOT_F32_MF8 : VInst<"vdot_f32_mf8_fpm", "(>>F)(>>F)..V", "mQm">;
 
-  def VDOT_LANE_F32_MF8 : VInst<"vdot_lane_f32_mf8_fpm", "(>>F)(>>F)..IV", "m", [ImmCheck<3, ImmCheck0_1, 0>]>;
-  def VDOT_LANEQ_F32_MF8 : VInst<"vdot_laneq_f32_mf8_fpm", "(>>F)(>>F).QIV", "m", [ImmCheck<3, ImmCheck0_3, 0>]>;
+  def VDOT_LANE_F32_MF8 : VInst<"vdot_lane_f32_mf8_fpm", "(>>F)(>>F)..IV", "m", [ImmCheck<3, ImmCheck0_1, 0>]>;
+  def VDOT_LANEQ_F32_MF8 : VInst<"vdot_laneq_f32_mf8_fpm", "(>>F)(>>F).QIV", "m", [ImmCheck<3, ImmCheck0_3, 0>]>;
 
-  def VDOTQ_LANE_F32_MF8 : VInst<"vdotq_lane_f32_mf8_fpm", "(>>F)(>>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_1, 0>]>;
-  def VDOTQ_LANEQ_F32_MF8 : VInst<"vdotq_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_3, 0>]>;
+  def VDOTQ_LANE_F32_MF8 : VInst<"vdot_lane_f32_mf8_fpm", "(>>F)(>>F).qIV", "Qm", [ImmCheck<3, ImmCheck0_1, 0>]>;
+  def VDOTQ_LANEQ_F32_MF8 : VInst<"vdot_laneq_f32_mf8_fpm", "(>>F)(>>F)..IV", "Qm", [ImmCheck<3, ImmCheck0_3, 0>]>;
 }
 
 let ArchGuard = "defined(__aarch64__)", TargetGuard = "neon,faminmax" in {
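One reading aid for the consolidated declarations above: in arm_neon.td a leading "Q" in the type-modifier string selects the 128-bit vector form and, by the NEON emitter's usual convention, inserts a "q" into the intrinsic name, so the single "mQm" definition is expected to produce both widths, and the VDOTQ_* lane defs can reuse the unprefixed name strings. Under that assumption the merged VDOT_F16_MF8 def corresponds to prototypes of this shape (illustrative; they match the signatures exercised by the tests in the next patch):

    /* Assumed prototypes generated from the single "mQm" definition. */
    float16x4_t vdot_f16_mf8_fpm(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpm);
    float16x8_t vdotq_f16_mf8_fpm(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpm);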
From d7896c34fc681c020b39a5a6d7ddfc403d915d20 Mon Sep 17 00:00:00 2001
From: Momchil Velikov <momchil.velikov@arm.com>
Date: Tue, 14 Jan 2025 10:35:21 +0000
Subject: [PATCH 16/16] [fixup] Add C++ runs to tests, remove some opt passes

---
 .../fp8-intrinsics/acle_neon_fp8_fdot.c       | 171 +++++++++++++++---
 1 file changed, 141 insertions(+), 30 deletions(-)

diff --git a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c
index b273bc2abe877..4b565adf17861 100644
--- a/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c
+++ b/clang/test/CodeGen/AArch64/fp8-intrinsics/acle_neon_fp8_fdot.c
@@ -1,18 +1,30 @@
 // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5
-// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s
+// RUN: %clang_cc1 -x c++ -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg | FileCheck %s -check-prefix CHECK-CXX
 
-// REQUIES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +neon -target-feature +bf16 -target-feature +fp8 -target-feature +fp8dot2 -target-feature +fp8dot4 -O3 -Werror -Wall -S -o /dev/null %s
+
+// REQUIRES: aarch64-registered-target
 
 #include <arm_neon.h>
 
 // CHECK-LABEL: define dso_local <4 x half> @test_vdot_f16(
 // CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[FDOT21_I:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: [[FDOT21_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
 // CHECK-NEXT: ret <4 x half> [[FDOT21_I]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <4 x half> @_Z13test_vdot_f1613__Float16x4_tu13__MFloat8x8_tu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8>
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT: [[FDOT21_I:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
+// CHECK-CXX-NEXT: ret <4 x half> [[FDOT21_I]]
+//
 float16x4_t test_vdot_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
   return vdot_f16_mf8_fpm(vd, vn, vm, fpmr);
 }
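The fpm_t argument threaded through these tests is the FP8 mode word that the generated code writes to FPMR (via @llvm.aarch64.set.fpmr) before the dot product executes. In user code it would normally be built with the ACLE FP8 mode helpers rather than passed around as a raw integer; the helper and enumerator names below follow the ACLE FP8 proposal and should be treated as assumptions, not as something this patch adds:

    /* Hedged sketch: build a mode word selecting E4M3 for both sources,
       then invoke the dot product under that mode (assumed helper names). */
    float16x4_t dot_e4m3(float16x4_t acc, mfloat8x8_t a, mfloat8x8_t b) {
      fpm_t fpm = __arm_fpm_init();                         /* default mode word */
      fpm = __arm_set_fpm_src1_format(fpm, __ARM_FPM_E4M3);
      fpm = __arm_set_fpm_src2_format(fpm, __ARM_FPM_E4M3);
      return vdot_f16_mf8_fpm(acc, a, b, fpm);
    }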
@@ -20,10 +32,19 @@ float16x4_t test_vdot_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t
 // CHECK-LABEL: define dso_local <8 x half> @test_vdotq_f16(
 // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[FDOT21_I:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: [[FDOT21_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
 // CHECK-NEXT: ret <8 x half> [[FDOT21_I]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z14test_vdotq_f1613__Float16x8_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT: [[FDOT21_I:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT: ret <8 x half> [[FDOT21_I]]
+//
 float16x8_t test_vdotq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
   return vdotq_f16_mf8_fpm(vd, vn, vm, fpmr);
 }
@@ -31,11 +52,23 @@ float16x8_t test_vdotq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm
 // CHECK-LABEL: define dso_local <4 x half> @test_vdot_lane_f16(
 // CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 3)
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[FDOT2_LANE]], <8 x i8> [[VN]], <16 x i8> [[TMP1]], i32 3)
 // CHECK-NEXT: ret <4 x half> [[FDOT2_LANE1]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <4 x half> @_Z18test_vdot_lane_f1613__Float16x4_tu13__MFloat8x8_tu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8>
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT: [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-CXX-NEXT: [[FDOT2_LANE1:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[FDOT2_LANE]], <8 x i8> [[VN]], <16 x i8> [[TMP1]], i32 3)
+// CHECK-CXX-NEXT: ret <4 x half> [[FDOT2_LANE1]]
+//
 float16x4_t test_vdot_lane_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
   return vdot_lane_f16_mf8_fpm(vd, vn, vm, 3, fpmr);
 }
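A reminder that the lane operand in the checks above is an immediate: it selects a group of FP8 elements from vm (a byte pair for the f16 forms, a four-byte group for the f32 forms), which is why the ImmCheck ranges in the previous patch are 0-3/0-7 for fdot2 and 0-1/0-3 for fdot4. The tests deliberately use the top of each range; for example, mirroring test_vdot_lane_f16:

    /* Lane 3 is the highest valid pair index for an 8-byte vm operand;
       anything larger is rejected at compile time by the immediate check. */
    float16x4_t dot_top_pair(float16x4_t acc, mfloat8x8_t a, mfloat8x8_t b, fpm_t fpm) {
      return vdot_lane_f16_mf8_fpm(acc, a, b, 3, fpm);
    }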
@@ -43,10 +76,21 @@ float16x4_t test_vdot_lane_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x8_t vm, f
 // CHECK-LABEL: define dso_local <4 x half> @test_vdot_laneq_f16(
 // CHECK-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8>
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[FDOT2_LANE]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
 // CHECK-NEXT: ret <4 x half> [[FDOT2_LANE1]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <4 x half> @_Z19test_vdot_laneq_f1613__Float16x4_tu13__MFloat8x8_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <4 x half> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <4 x half> [[VD]] to <8 x i8>
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT: [[FDOT2_LANE:%.*]] = bitcast <8 x i8> [[TMP0]] to <4 x half>
+// CHECK-CXX-NEXT: [[FDOT2_LANE1:%.*]] = call <4 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v4f16.v8i8(<4 x half> [[FDOT2_LANE]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-CXX-NEXT: ret <4 x half> [[FDOT2_LANE1]]
+//
 float16x4_t test_vdot_laneq_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x16_t vm, fpm_t fpmr) {
   return vdot_laneq_f16_mf8_fpm(vd, vn, vm, 7, fpmr);
 }
@@ -54,11 +98,23 @@ float16x4_t test_vdot_laneq_f16(float16x4_t vd, mfloat8x8_t vn, mfloat8x16_t vm,
 // CHECK-LABEL: define dso_local <8 x half> @test_vdotq_lane_f16(
 // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 3)
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[FDOT2_LANE]], <16 x i8> [[VN]], <16 x i8> [[TMP1]], i32 3)
 // CHECK-NEXT: ret <8 x half> [[FDOT2_LANE1]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z19test_vdotq_lane_f1613__Float16x8_tu14__MFloat8x16_tu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
+// CHECK-CXX-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT: [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-CXX-NEXT: [[FDOT2_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[FDOT2_LANE]], <16 x i8> [[VN]], <16 x i8> [[TMP1]], i32 3)
+// CHECK-CXX-NEXT: ret <8 x half> [[FDOT2_LANE1]]
+//
 float16x8_t test_vdotq_lane_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) {
   return vdotq_lane_f16_mf8_fpm(vd, vn, vm, 3, fpmr);
 }
@@ -66,10 +122,21 @@ float16x8_t test_vdotq_lane_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x8_t vm,
 // CHECK-LABEL: define dso_local <8 x half> @test_vdotq_laneq_f16(
 // CHECK-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = tail call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-NEXT: [[FDOT2_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[FDOT2_LANE]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
 // CHECK-NEXT: ret <8 x half> [[FDOT2_LANE1]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <8 x half> @_Z20test_vdotq_laneq_f1613__Float16x8_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <8 x half> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[VD]] to <16 x i8>
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT: [[FDOT2_LANE:%.*]] = bitcast <16 x i8> [[TMP0]] to <8 x half>
+// CHECK-CXX-NEXT: [[FDOT2_LANE1:%.*]] = call <8 x half> @llvm.aarch64.neon.fp8.fdot2.lane.v8f16.v16i8(<8 x half> [[FDOT2_LANE]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 7)
+// CHECK-CXX-NEXT: ret <8 x half> [[FDOT2_LANE1]]
+//
 float16x8_t test_vdotq_laneq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
   return vdotq_laneq_f16_mf8_fpm(vd, vn, vm, 7, fpmr);
 }
@@ -77,10 +144,17 @@ float16x8_t test_vdotq_laneq_f16(float16x8_t vd, mfloat8x16_t vn, mfloat8x16_t v
 // CHECK-LABEL: define dso_local <2 x float> @test_vdot_f32(
 // CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[FDOT4_I:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: [[FDOT4_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
 // CHECK-NEXT: ret <2 x float> [[FDOT4_I]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z13test_vdot_f3213__Float32x2_tu13__MFloat8x8_tu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT: [[FDOT4_I:%.*]] = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <8 x i8> [[VM]])
+// CHECK-CXX-NEXT: ret <2 x float> [[FDOT4_I]]
+//
 float32x2_t test_vdot_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
   return vdot_f32_mf8_fpm(vd, vn, vm, fpmr);
 }
@@ -88,10 +162,17 @@ float32x2_t test_vdot_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t
 // CHECK-LABEL: define dso_local <4 x float> @test_vdotq_f32(
 // CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[FDOT4_I:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: [[FDOT4_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
 // CHECK-NEXT: ret <4 x float> [[FDOT4_I]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z14test_vdotq_f3213__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT: [[FDOT4_I:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]])
+// CHECK-CXX-NEXT: ret <4 x float> [[FDOT4_I]]
+//
 float32x4_t test_vdotq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
   return vdotq_f32_mf8_fpm(vd, vn, vm, fpmr);
 }
@@ -99,11 +180,19 @@ float32x4_t test_vdotq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm
 // CHECK-LABEL: define dso_local <2 x float> @test_vdot_lane_f32(
 // CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: [[FDOT4_LANE:%.*]] = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
 // CHECK-NEXT: ret <2 x float> [[FDOT4_LANE]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z18test_vdot_lane_f3213__Float32x2_tu13__MFloat8x8_tu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT: [[FDOT4_LANE:%.*]] = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
+// CHECK-CXX-NEXT: ret <2 x float> [[FDOT4_LANE]]
+//
 float32x2_t test_vdot_lane_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, fpm_t fpmr) {
   return vdot_lane_f32_mf8_fpm(vd, vn, vm, 1, fpmr);
 }
@@ -111,10 +200,17 @@ float32x2_t test_vdot_lane_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x8_t vm, f
 // CHECK-LABEL: define dso_local <2 x float> @test_vdot_laneq_f32(
 // CHECK-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: [[FDOT4_LANE:%.*]] = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
 // CHECK-NEXT: ret <2 x float> [[FDOT4_LANE]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <2 x float> @_Z19test_vdot_laneq_f3213__Float32x2_tu13__MFloat8x8_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <2 x float> noundef [[VD:%.*]], <8 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT: [[FDOT4_LANE:%.*]] = call <2 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v2f32.v8i8(<2 x float> [[VD]], <8 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
+// CHECK-CXX-NEXT: ret <2 x float> [[FDOT4_LANE]]
+//
 float32x2_t test_vdot_laneq_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x16_t vm, fpm_t fpmr) {
   return vdot_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpmr);
 }
@@ -122,11 +218,19 @@ float32x2_t test_vdot_laneq_f32(float32x2_t vd, mfloat8x8_t vn, mfloat8x16_t vm,
 // CHECK-LABEL: define dso_local <4 x float> @test_vdotq_lane_f32(
 // CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x i8> [[VM]], <8 x i8> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
+// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: [[FDOT4_LANE:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
 // CHECK-NEXT: ret <4 x float> [[FDOT4_LANE]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z19test_vdotq_lane_f3213__Float32x4_tu14__MFloat8x16_tu13__MFloat8x8_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <8 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.vector.insert.v16i8.v8i8(<16 x i8> poison, <8 x i8> [[VM]], i64 0)
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT: [[FDOT4_LANE:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[TMP0]], i32 1)
+// CHECK-CXX-NEXT: ret <4 x float> [[FDOT4_LANE]]
+//
 float32x4_t test_vdotq_lane_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm, fpm_t fpmr) {
   return vdotq_lane_f32_mf8_fpm(vd, vn, vm, 1, fpmr);
 }
@@ -134,10 +238,17 @@ float32x4_t test_vdotq_lane_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x8_t vm,
 // CHECK-LABEL: define dso_local <4 x float> @test_vdotq_laneq_f32(
 // CHECK-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
 // CHECK-NEXT: [[ENTRY:.*:]]
-// CHECK-NEXT: tail call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
-// CHECK-NEXT: [[FDOT4_LANE:%.*]] = tail call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
+// CHECK-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-NEXT: [[FDOT4_LANE:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
 // CHECK-NEXT: ret <4 x float> [[FDOT4_LANE]]
 //
+// CHECK-CXX-LABEL: define dso_local noundef <4 x float> @_Z20test_vdotq_laneq_f3213__Float32x4_tu14__MFloat8x16_tu14__MFloat8x16_tm(
+// CHECK-CXX-SAME: <4 x float> noundef [[VD:%.*]], <16 x i8> [[VN:%.*]], <16 x i8> [[VM:%.*]], i64 noundef [[FPMR:%.*]]) #[[ATTR0]] {
+// CHECK-CXX-NEXT: [[ENTRY:.*:]]
+// CHECK-CXX-NEXT: call void @llvm.aarch64.set.fpmr(i64 [[FPMR]])
+// CHECK-CXX-NEXT: [[FDOT4_LANE:%.*]] = call <4 x float> @llvm.aarch64.neon.fp8.fdot4.lane.v4f32.v16i8(<4 x float> [[VD]], <16 x i8> [[VN]], <16 x i8> [[VM]], i32 3)
+// CHECK-CXX-NEXT: ret <4 x float> [[FDOT4_LANE]]
+//
 float32x4_t test_vdotq_laneq_f32(float32x4_t vd, mfloat8x16_t vn, mfloat8x16_t vm, fpm_t fpmr) {
   return vdotq_laneq_f32_mf8_fpm(vd, vn, vm, 3, fpmr);
 }
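As a closing cross-check on the new C++ RUN line, the CHECK-CXX labels are simply the Itanium manglings of the same test functions: the NEON and FP8 vector types are encoded through their builtin names, and fpm_t through its underlying unsigned long. Decomposing the last label above (breakdown is illustrative):

    /* _Z20test_vdotq_laneq_f32   source name, length 20
       13__Float32x4_t            float32x4_t, mangled via its builtin name
       u14__MFloat8x16_t (twice)  vendor-extended type for mfloat8x16_t
       m                          unsigned long, the underlying type of fpm_t */
    float32x4_t test_vdotq_laneq_f32(float32x4_t, mfloat8x16_t, mfloat8x16_t, fpm_t);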