diff --git a/clang/lib/Basic/Targets/X86.cpp b/clang/lib/Basic/Targets/X86.cpp index 84a05cec04e7f..e4d3ad04fe9de 100644 --- a/clang/lib/Basic/Targets/X86.cpp +++ b/clang/lib/Basic/Targets/X86.cpp @@ -1109,6 +1109,10 @@ void X86TargetInfo::getTargetDefines(const LangOptions &Opts, if (HasFloat128) Builder.defineMacro("__SIZEOF_FLOAT128__", "16"); + + if (Opts.CFProtectionReturn || Opts.CFProtectionBranch) + Builder.defineMacro("__CET__", Twine{(Opts.CFProtectionReturn << 1) | + Opts.CFProtectionBranch}); } bool X86TargetInfo::isValidFeatureName(StringRef Name) const { diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index bae41fc06c036..d5a7fc7e85230 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -232,6 +232,11 @@ void visualstudio::Linker::ConstructJob(Compilation &C, const JobAction &JA, } } + if (C.getDriver().isUsingLTO()) { + if (Arg *A = tools::getLastProfileSampleUseArg(Args)) + CmdArgs.push_back(Args.MakeArgString(std::string("-lto-sample-profile:") + + A->getValue())); + } Args.AddAllArgValues(CmdArgs, options::OPT__SLASH_link); // Control Flow Guard checks diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index b9a5c0589ebc4..4eb743acf327f 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4776,17 +4776,6 @@ static bool ParsePreprocessorArgs(PreprocessorOptions &Opts, ArgList &Args, } } - // Add the __CET__ macro if a CFProtection option is set. - if (const Arg *A = Args.getLastArg(OPT_fcf_protection_EQ)) { - StringRef Name = A->getValue(); - if (Name == "branch") - Opts.addMacroDef("__CET__=1"); - else if (Name == "return") - Opts.addMacroDef("__CET__=2"); - else if (Name == "full") - Opts.addMacroDef("__CET__=3"); - } - // Add macros from the command line. for (const auto *A : Args.filtered(OPT_D, OPT_U)) { if (A->getOption().matches(OPT_D)) diff --git a/clang/lib/Sema/SemaCast.cpp b/clang/lib/Sema/SemaCast.cpp index 8972957ded9f5..89e8082ee80e7 100644 --- a/clang/lib/Sema/SemaCast.cpp +++ b/clang/lib/Sema/SemaCast.cpp @@ -104,6 +104,7 @@ namespace { void CheckStaticCast(); void CheckDynamicCast(); void CheckCXXCStyleCast(bool FunctionalCast, bool ListInitialization); + bool CheckHLSLCStyleCast(CheckedConversionKind CCK); void CheckCStyleCast(); void CheckBuiltinBitCast(); void CheckAddrspaceCast(); @@ -2776,39 +2777,9 @@ void CastOperation::CheckCXXCStyleCast(bool FunctionalStyle, CheckedConversionKind CCK = FunctionalStyle ? 
CheckedConversionKind::FunctionalCast : CheckedConversionKind::CStyleCast; - - QualType SrcTy = SrcExpr.get()->getType(); - // This case should not trigger on regular vector cast, vector truncation - if (Self.getLangOpts().HLSL && - Self.HLSL().CanPerformElementwiseCast(SrcExpr.get(), DestType)) { - if (SrcTy->isConstantArrayType()) - SrcExpr = Self.ImpCastExprToType( - SrcExpr.get(), Self.Context.getArrayParameterType(SrcTy), - CK_HLSLArrayRValue, VK_PRValue, nullptr, CCK); - Kind = CK_HLSLElementwiseCast; - return; - } - - // This case should not trigger on regular vector splat - // If the relative order of this and the HLSLElementWise cast checks - // are changed, it might change which cast handles what in a few cases - if (Self.getLangOpts().HLSL && - Self.HLSL().CanPerformAggregateSplatCast(SrcExpr.get(), DestType)) { - const VectorType *VT = SrcTy->getAs<VectorType>(); - // change splat from vec1 case to splat from scalar - if (VT && VT->getNumElements() == 1) - SrcExpr = Self.ImpCastExprToType( - SrcExpr.get(), VT->getElementType(), CK_HLSLVectorTruncation, - SrcExpr.get()->getValueKind(), nullptr, CCK); - // Inserting a scalar cast here allows for a simplified codegen in - // the case the destTy is a vector - if (const VectorType *DVT = DestType->getAs<VectorType>()) - SrcExpr = Self.ImpCastExprToType( - SrcExpr.get(), DVT->getElementType(), - Self.PrepareScalarCast(SrcExpr, DVT->getElementType()), - SrcExpr.get()->getValueKind(), nullptr, CCK); - Kind = CK_HLSLAggregateSplatCast; - return + if (Self.getLangOpts().HLSL) { + if (CheckHLSLCStyleCast(CCK)) + return; } if (ValueKind == VK_PRValue && !DestType->isRecordType() && @@ -2927,6 +2898,56 @@ void CastOperation::CheckCXXCStyleCast(bool FunctionalStyle, } } +// CheckHLSLCStyleCast - Returns `true` if the cast is handled or errored as an +// HLSL-specific cast. Returns false if the cast should be checked as a CXX +// C-Style cast. +bool CastOperation::CheckHLSLCStyleCast(CheckedConversionKind CCK) { + assert(Self.getLangOpts().HLSL && "Must be HLSL!"); + QualType SrcTy = SrcExpr.get()->getType(); + // HLSL has several unique forms of C-style casts which support aggregate to + // aggregate casting. 
+ // This case should not trigger on regular vector cast, vector truncation + if (Self.HLSL().CanPerformElementwiseCast(SrcExpr.get(), DestType)) { + if (SrcTy->isConstantArrayType()) + SrcExpr = Self.ImpCastExprToType( + SrcExpr.get(), Self.Context.getArrayParameterType(SrcTy), + CK_HLSLArrayRValue, VK_PRValue, nullptr, CCK); + Kind = CK_HLSLElementwiseCast; + return true; + } + + // This case should not trigger on regular vector splat + // If the relative order of this and the HLSLElementWise cast checks + // are changed, it might change which cast handles what in a few cases + if (Self.HLSL().CanPerformAggregateSplatCast(SrcExpr.get(), DestType)) { + const VectorType *VT = SrcTy->getAs<VectorType>(); + // change splat from vec1 case to splat from scalar + if (VT && VT->getNumElements() == 1) + SrcExpr = Self.ImpCastExprToType( + SrcExpr.get(), VT->getElementType(), CK_HLSLVectorTruncation, + SrcExpr.get()->getValueKind(), nullptr, CCK); + // Inserting a scalar cast here allows for a simplified codegen in + // the case the destTy is a vector + if (const VectorType *DVT = DestType->getAs<VectorType>()) + SrcExpr = Self.ImpCastExprToType( + SrcExpr.get(), DVT->getElementType(), + Self.PrepareScalarCast(SrcExpr, DVT->getElementType()), + SrcExpr.get()->getValueKind(), nullptr, CCK); + Kind = CK_HLSLAggregateSplatCast; + return true; + } + + // If the destination is an array, we've exhausted the valid HLSL casts, so we + should emit a diagnostic and stop processing. + if (DestType->isArrayType()) { + Self.Diag(OpRange.getBegin(), diag::err_bad_cxx_cast_generic) + << 4 << SrcTy << DestType; + SrcExpr = ExprError(); + return true; + } + return false; +} + /// DiagnoseBadFunctionCast - Warn whenever a function call is cast to a /// non-matching type. Such as enum function call to int, int call to /// pointer; etc. Cast to 'void' is an exception. diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 6a76e6d74a4b0..a34005bf376aa 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -6585,6 +6585,18 @@ void InitializationSequence::InitializeFrom(Sema &S, } } + if (S.getLangOpts().HLSL && Initializer && isa<ConstantArrayType>(DestAT)) { + QualType SrcType = Entity.getType(); + if (SrcType->isArrayParameterType()) + SrcType = + cast<ArrayParameterType>(SrcType)->getConstantArrayType(Context); + if (S.Context.hasSameUnqualifiedType(DestType, SrcType)) { + TryArrayCopy(S, Kind, Entity, Initializer, DestType, *this, + TreatUnavailableAsInvalid); + return; + } + } + // Some kinds of initialization permit an array to be initialized from // another array of the same type, and perform elementwise initialization. 
if (Initializer && isa<ConstantArrayType>(DestAT) && diff --git a/clang/test/Driver/cl-link.c b/clang/test/Driver/cl-link.c index 9bf8a8137926d..726bc26a64edd 100644 --- a/clang/test/Driver/cl-link.c +++ b/clang/test/Driver/cl-link.c @@ -71,3 +71,6 @@ // RUN: %clang_cl -m32 -arch:IA32 --target=i386-pc-win32 /Tc%s -fuse-ld=lld -### -fsanitize=address 2>&1 | FileCheck --check-prefix=INFER-LLD %s // INFER-LLD: lld-link // INFER-LLD-NOT: INFERASANLIBS + +// RUN: %clang_cl --target=x86_64-unknown-windows-msvc /Tc%s -flto -fuse-ld=lld -### -fprofile-sample-use=%S/Inputs/file.prof 2>&1 | FileCheck -check-prefix=CHECK-SAMPLE-PROFILE %s +// CHECK-SAMPLE-PROFILE: "-lto-sample-profile:{{.*}}/file.prof" diff --git a/clang/test/Preprocessor/riscv-cf-protection-return.c b/clang/test/Preprocessor/riscv-cf-protection-return.c index 3a93a88fa6839..a4cbaa1edf68c 100644 --- a/clang/test/Preprocessor/riscv-cf-protection-return.c +++ b/clang/test/Preprocessor/riscv-cf-protection-return.c @@ -40,5 +40,7 @@ // RUN: -menable-experimental-extensions -fcf-protection=full -E -dM %s -o - \ // RUN: | FileCheck --check-prefixes=SHSTK-MACRO %s +// SHSTK-MACRO-NOT: __CET__ // SHSTK-MACRO: __riscv_shadow_stack 1{{$}} +// SHSTK-MACRO-NOT: __CET__ // NO-MACRO-NOT: __riscv_shadow_stack diff --git a/clang/test/SemaHLSL/Language/AssignArray.hlsl b/clang/test/SemaHLSL/Language/AssignArray.hlsl new file mode 100644 index 0000000000000..1f813e7a350b1 --- /dev/null +++ b/clang/test/SemaHLSL/Language/AssignArray.hlsl @@ -0,0 +1,34 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library %s -ast-dump | FileCheck %s + +typedef vector<int, 4> int8[2]; + +export void fn(int8 A) { + int8 a = {A}; +// CHECK-LABEL: VarDecl {{.*}} b 'int8':'vector<int, 4>[2]' cinit +// CHECK-NEXT: ArrayInitLoopExpr {{.*}} 'int8':'vector<int, 4>[2]' +// CHECK-NEXT: OpaqueValueExpr {{.*}} 'int8':'vector<int, 4>[2]' lvalue +// CHECK-NEXT: DeclRefExpr {{.*}} 'int8':'vector<int, 4>[2]' lvalue Var {{.*}} 'a' 'int8':'vector<int, 4>[2]' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<int, 4>' +// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'vector<int, 4>' lvalue +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<int, 4> *' +// CHECK-NEXT: OpaqueValueExpr {{.*}} 'int8':'vector<int, 4>[2]' lvalue +// CHECK-NEXT: DeclRefExpr {{.*}} 'int8':'vector<int, 4>[2]' lvalue Var {{.*}} 'a' 'int8':'vector<int, 4>[2]' +// CHECK-NEXT: ArrayInitIndexExpr {{.*}} 'unsigned long' + int8 b = a; + +// CHECK-LABEL: VarDecl {{.*}} c 'int8':'vector<int, 4>[2]' cinit +// CHECK-NEXT: ArrayInitLoopExpr {{.*}} 'int8':'vector<int, 4>[2]' +// CHECK-NEXT: OpaqueValueExpr {{.*}} 'vector<int, 4>[2]' lvalue +// CHECK-NEXT: DeclRefExpr {{.*}} 'vector<int, 4>[2]' lvalue ParmVar {{.*}} 'A' 'vector<int, 4>[2]' +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<int, 4>' +// CHECK-NEXT: ArraySubscriptExpr {{.*}} 'vector<int, 4>' lvalue +// CHECK-NEXT: ImplicitCastExpr {{.*}} 'vector<int, 4> *' +// CHECK-NEXT: OpaqueValueExpr {{.*}} 'vector<int, 4>[2]' lvalue +// CHECK-NEXT: DeclRefExpr {{.*}} 'vector<int, 4>[2]' lvalue ParmVar {{.*}} 'A' 'vector<int, 4>[2]' +// CHECK-NEXT: ArrayInitIndexExpr {{.*}} 'unsigned long' + int8 c = A; +} + + + + diff --git a/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl b/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl index 9417249383469..30591507b3260 100644 --- a/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl +++ b/clang/test/SemaHLSL/Language/ElementwiseCast-errors.hlsl @@ -4,7 +4,7 @@ export void cantCast() { int A[3] = {1,2,3}; int B[4] = {1,2,3,4}; B = (int[4])A; - // expected-error@-1 {{C-style cast from 'int *' to 'int[4]' is not allowed}} + // expected-error@-1 {{C-style cast from 'int[3]' to 'int[4]' is not allowed}} } struct S { diff --git 
a/clang/tools/c-index-test/c-index-test.c b/clang/tools/c-index-test/c-index-test.c index a9d8261bd03e7..0e7de8b98ea07 100644 --- a/clang/tools/c-index-test/c-index-test.c +++ b/clang/tools/c-index-test/c-index-test.c @@ -1213,28 +1213,36 @@ static void PrintCursor(CXCursor Cursor, const char *CommentSchemaFile) { } } -static const char* GetCursorSource(CXCursor Cursor) { +static CXString createCXString(const char *CS) { + CXString Str; + Str.data = (const void *)CS; + Str.private_flags = 0; + return Str; +} + +static CXString duplicateCXString(const char *CS) { + CXString Str; + Str.data = strdup(CS); + Str.private_flags = 1; // CXS_Malloc + return Str; +} + +static CXString GetCursorSource(CXCursor Cursor) { CXSourceLocation Loc = clang_getCursorLocation(Cursor); CXString source; CXFile file; + const char *b; + CXString result; clang_getExpansionLocation(Loc, &file, 0, 0, 0); source = clang_getFileName(file); if (!clang_getCString(source)) { clang_disposeString(source); - return ""; - } - else { - const char *b = basename(clang_getCString(source)); - clang_disposeString(source); - return b; + return createCXString(""); } -} - -static CXString createCXString(const char *CS) { - CXString Str; - Str.data = (const void *) CS; - Str.private_flags = 0; - return Str; + b = basename(clang_getCString(source)); + result = duplicateCXString(b); + clang_disposeString(source); + return result; } /******************************************************************************/ @@ -1357,9 +1365,12 @@ enum CXChildVisitResult FilteredPrintingVisitor(CXCursor Cursor, if (!Data->Filter || (Cursor.kind == *(enum CXCursorKind *)Data->Filter)) { CXSourceLocation Loc = clang_getCursorLocation(Cursor); unsigned line, column; + CXString source; clang_getFileLocation(Loc, 0, &line, &column, 0); - printf("// %s: %s:%d:%d: ", FileCheckPrefix, - GetCursorSource(Cursor), line, column); + source = GetCursorSource(Cursor); + printf("// %s: %s:%d:%d: ", FileCheckPrefix, clang_getCString(source), line, + column); + clang_disposeString(source); PrintCursor(Cursor, Data->CommentSchemaFile); PrintCursorExtent(Cursor); if (clang_isDeclaration(Cursor.kind)) { @@ -1428,8 +1439,10 @@ static enum CXChildVisitResult FunctionScanVisitor(CXCursor Cursor, if (Ref.kind == CXCursor_NoDeclFound) { /* Nothing found here; that's fine. 
*/ } else if (Ref.kind != CXCursor_FunctionDecl) { - printf("// %s: %s:%d:%d: ", FileCheckPrefix, GetCursorSource(Ref), - curLine, curColumn); + CXString CursorSource = GetCursorSource(Ref); + printf("// %s: %s:%d:%d: ", FileCheckPrefix, + clang_getCString(CursorSource), curLine, curColumn); + clang_disposeString(CursorSource); PrintCursor(Ref, Data->CommentSchemaFile); printf("\n"); } @@ -1451,11 +1464,15 @@ enum CXChildVisitResult USRVisitor(CXCursor C, CXCursor parent, if (!Data->Filter || (C.kind == *(enum CXCursorKind *)Data->Filter)) { CXString USR = clang_getCursorUSR(C); const char *cstr = clang_getCString(USR); + CXString CursorSource; if (!cstr || cstr[0] == '\0') { clang_disposeString(USR); return CXChildVisit_Recurse; } - printf("// %s: %s %s", FileCheckPrefix, GetCursorSource(C), cstr); + CursorSource = GetCursorSource(C); + printf("// %s: %s %s", FileCheckPrefix, clang_getCString(CursorSource), + cstr); + clang_disposeString(CursorSource); PrintCursorExtent(C); printf("\n"); diff --git a/compiler-rt/test/sanitizer_common/TestCases/Posix/getpass.cpp b/compiler-rt/test/sanitizer_common/TestCases/Posix/getpass.cpp index 2711cfb112959..a4ca4a3c2f35d 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Posix/getpass.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Posix/getpass.cpp @@ -3,7 +3,6 @@ // Ignore leaks as this is not the point of test, but HWASAN reports one here. // RUN: %env_tool_opts=detect_leaks=0 %run %t | FileCheck %s -// REQUIRES: stable-runtime // XFAIL: android && asan // No libutil. diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index ccaa784ccb088..dcf9838edd74b 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -174,7 +174,7 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_atomic_flag_test`` ``201907L`` ---------------------------------------------------------- ----------------- - ``__cpp_lib_atomic_float`` *unimplemented* + ``__cpp_lib_atomic_float`` ``201711L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_atomic_lock_free_type_aliases`` ``201907L`` ---------------------------------------------------------- ----------------- diff --git a/libcxx/docs/Status/Cxx20Papers.csv b/libcxx/docs/Status/Cxx20Papers.csv index 360b5520260ce..1c060c32b6f57 100644 --- a/libcxx/docs/Status/Cxx20Papers.csv +++ b/libcxx/docs/Status/Cxx20Papers.csv @@ -2,7 +2,7 @@ "`P0463R1 <https://wg21.link/P0463R1>`__","Endian just Endian","2017-07 (Toronto)","|Complete|","7","" "`P0674R1 <https://wg21.link/P0674R1>`__","Extending make_shared to Support Arrays","2017-07 (Toronto)","|Complete|","15","" "","","","","","" -"`P0020R6 <https://wg21.link/P0020R6>`__","Floating Point Atomic","2017-11 (Albuquerque)","|Complete|","18","" +"`P0020R6 <https://wg21.link/P0020R6>`__","Floating Point Atomic","2017-11 (Albuquerque)","|Complete|","18","The feature-test macro was not set until LLVM 20." 
"`P0053R7 `__","C++ Synchronized Buffered Ostream","2017-11 (Albuquerque)","|Complete|","18","" "`P0202R3 `__","Add constexpr modifiers to functions in and Headers","2017-11 (Albuquerque)","|Complete|","12","" "`P0415R1 `__","Constexpr for ``std::complex``\ ","2017-11 (Albuquerque)","|Complete|","16","" diff --git a/libcxx/include/version b/libcxx/include/version index c5966b90c061d..63ead9fd5d29d 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -378,7 +378,7 @@ __cpp_lib_void_t 201411L # define __cpp_lib_array_constexpr 201811L # define __cpp_lib_assume_aligned 201811L # define __cpp_lib_atomic_flag_test 201907L -// # define __cpp_lib_atomic_float 201711L +# define __cpp_lib_atomic_float 201711L # define __cpp_lib_atomic_lock_free_type_aliases 201907L # define __cpp_lib_atomic_ref 201806L // # define __cpp_lib_atomic_shared_ptr 201711L diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp index 9ed18fbfe19ac..5a21e6320bffe 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/atomic.version.compile.pass.cpp @@ -169,17 +169,11 @@ # error "__cpp_lib_atomic_flag_test should have the value 201907L in c++20" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should be defined in c++20" -# endif -# if __cpp_lib_atomic_float != 201711L -# error "__cpp_lib_atomic_float should have the value 201711L in c++20" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" -# endif +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++20" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++20" # endif # ifndef __cpp_lib_atomic_is_always_lock_free @@ -262,17 +256,11 @@ # error "__cpp_lib_atomic_flag_test should have the value 201907L in c++23" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should be defined in c++23" -# endif -# if __cpp_lib_atomic_float != 201711L -# error "__cpp_lib_atomic_float should have the value 201711L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" -# endif +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++23" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++23" # endif # ifndef __cpp_lib_atomic_is_always_lock_free @@ -355,17 +343,11 @@ # error "__cpp_lib_atomic_flag_test should have the value 201907L in c++26" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should be defined in c++26" -# endif -# if __cpp_lib_atomic_float != 201711L -# error "__cpp_lib_atomic_float should have the value 201711L in c++26" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" 
-# endif +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++26" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++26" # endif # ifndef __cpp_lib_atomic_is_always_lock_free diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index 137d6cb428930..1e4465d515e6b 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -3282,17 +3282,11 @@ # error "__cpp_lib_atomic_flag_test should have the value 201907L in c++20" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should be defined in c++20" -# endif -# if __cpp_lib_atomic_float != 201711L -# error "__cpp_lib_atomic_float should have the value 201711L in c++20" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" -# endif +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++20" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++20" # endif # ifndef __cpp_lib_atomic_is_always_lock_free @@ -4707,17 +4701,11 @@ # error "__cpp_lib_atomic_flag_test should have the value 201907L in c++23" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should be defined in c++23" -# endif -# if __cpp_lib_atomic_float != 201711L -# error "__cpp_lib_atomic_float should have the value 201711L in c++23" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" -# endif +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++23" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++23" # endif # ifndef __cpp_lib_atomic_is_always_lock_free @@ -6369,17 +6357,11 @@ # error "__cpp_lib_atomic_flag_test should have the value 201907L in c++26" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should be defined in c++26" -# endif -# if __cpp_lib_atomic_float != 201711L -# error "__cpp_lib_atomic_float should have the value 201711L in c++26" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_atomic_float -# error "__cpp_lib_atomic_float should not be defined because it is unimplemented in libc++!" 
-# endif +# ifndef __cpp_lib_atomic_float +# error "__cpp_lib_atomic_float should be defined in c++26" +# endif +# if __cpp_lib_atomic_float != 201711L +# error "__cpp_lib_atomic_float should have the value 201711L in c++26" # endif # ifndef __cpp_lib_atomic_is_always_lock_free diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index 25168b9087754..8bf7633e985d5 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -169,7 +169,6 @@ def add_version_header(tc): "name": "__cpp_lib_atomic_float", "values": {"c++20": 201711}, "headers": ["atomic"], - "unimplemented": True, }, { "name": "__cpp_lib_atomic_is_always_lock_free", diff --git a/lld/test/ELF/input-section-flags.s b/lld/test/ELF/input-section-flags.s index 0c8e31c77b0dc..f848d55e6fddc 100644 --- a/lld/test/ELF/input-section-flags.s +++ b/lld/test/ELF/input-section-flags.s @@ -15,6 +15,7 @@ # RUN: .outsec3 : { INPUT_SECTION_FLAGS(SHF_WRITE) *(.sec.*) } \ # RUN: .outsec4 : { INPUT_SECTION_FLAGS(SHF_MERGE & !SHF_STRINGS) *(.sec.*) } \ # RUN: .outsec5 : { INPUT_SECTION_FLAGS(SHF_STRINGS) *(.sec.*) } \ +# RUN: .outsec6 : { INPUT_SECTION_FLAGS(!SHF_TLS & !SHF_EXCLUDE & !SHF_COMPRESSED & !SHF_ARM_PURECODE) *(.sec.*) } \ # RUN: } " > %t.script # RUN: ld.lld -o %t1 --script %t.script %t.o # RUN: llvm-readobj --symbols %t1 | FileCheck %s diff --git a/lld/test/wasm/data-segments.ll b/lld/test/wasm/data-segments.ll index 41868a0b2b50b..79f1d384919d9 100644 --- a/lld/test/wasm/data-segments.ll +++ b/lld/test/wasm/data-segments.ll @@ -6,36 +6,36 @@ ; RUN: llc --mtriple=wasm32-unknown-unknown -filetype=obj %s -o %t.atomics.bulk-mem.pic.o -relocation-model=pic -mattr=+atomics,+bulk-memory,+mutable-globals ; RUN: llc --mtriple=wasm64-unknown-unknown -filetype=obj %s -o %t.atomics.bulk-mem.pic-mem64.o -relocation-model=pic -mattr=+atomics,+bulk-memory,+mutable-globals -; atomics, shared memory => error +;; atomics, shared memory => error ; RUN: not wasm-ld -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.o -o %t.atomics.wasm 2>&1 | FileCheck %s --check-prefix ERROR -; bulk memory, unshared memory => active segments +;; bulk memory, unshared memory => active segments ; RUN: wasm-ld -no-gc-sections --no-entry %t.bulk-mem.o -o %t.bulk-mem.wasm ; RUN: obj2yaml %t.bulk-mem.wasm | FileCheck %s --check-prefixes ACTIVE,ACTIVE32 -; bulk memory, unshared memory, wasm64 => active segments +;; bulk memory, unshared memory, wasm64 => active segments ; RUN: wasm-ld -mwasm64 -no-gc-sections --no-entry %t.bulk-mem64.o -o %t.bulk-mem64.wasm ; RUN: obj2yaml %t.bulk-mem64.wasm | FileCheck %s --check-prefixes ACTIVE,ACTIVE64 -; atomics, bulk memory, shared memory => passive segments +;; atomics, bulk memory, shared memory => passive segments ; RUN: wasm-ld -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.bulk-mem.o -o %t.atomics.bulk-mem.wasm ; RUN: obj2yaml %t.atomics.bulk-mem.wasm | FileCheck %s --check-prefix PASSIVE ; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.atomics.bulk-mem.wasm | FileCheck %s --check-prefixes DIS,NOPIC-DIS -DPTR=i32 -; atomics, bulk memory, shared memory, wasm64 => passive segments +;; atomics, bulk memory, shared memory, wasm64 => passive segments ; RUN: wasm-ld -mwasm64 -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.bulk-mem64.o -o 
%t.atomics.bulk-mem64.wasm ; RUN: obj2yaml %t.atomics.bulk-mem64.wasm | FileCheck %s --check-prefix PASSIVE ; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.atomics.bulk-mem64.wasm | FileCheck %s --check-prefixes DIS,NOPIC-DIS -DPTR=i64 -; Also test in combination with PIC/pie +;; Also test in combination with PIC/pie ; RUN: wasm-ld --experimental-pic -pie -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.bulk-mem.pic.o -o %t.pic.wasm ; RUN: obj2yaml %t.pic.wasm | FileCheck %s --check-prefixes PASSIVE-PIC,PASSIVE32-PIC -; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_apply_data_relocs,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.pic.wasm | FileCheck %s --check-prefixes DIS,PIC-DIS -DPTR=i32 +; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.pic.wasm | FileCheck %s --check-prefixes DIS,PIC-DIS -DPTR=i32 -; Also test in combination with PIC/pie + wasm64 +;; Also test in combination with PIC/pie + wasm64 ; RUN: wasm-ld -mwasm64 --experimental-pic -pie -no-gc-sections --no-entry --shared-memory --max-memory=131072 %t.atomics.bulk-mem.pic-mem64.o -o %t.pic-mem64.wasm ; RUN: obj2yaml %t.pic-mem64.wasm | FileCheck %s --check-prefixes PASSIVE-PIC,PASSIVE64-PIC -; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_apply_data_relocs,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.pic-mem64.wasm | FileCheck %s --check-prefixes DIS,PIC-DIS -DPTR=i64 +; RUN: llvm-objdump --disassemble-symbols=__wasm_call_ctors,__wasm_init_memory --no-show-raw-insn --no-leading-addr %t.pic-mem64.wasm | FileCheck %s --check-prefixes DIS,PIC-DIS -DPTR=i64 @a = hidden global [6 x i8] c"hello\00", align 1 @b = hidden global [8 x i8] c"goodbye\00", align 1 @@ -151,7 +151,7 @@ ; PASSIVE-PIC-NEXT: - Index: 2 ; PASSIVE-PIC-NEXT: Name: __wasm_init_memory -; no data relocations. +;; no data relocations. ; DIS-LABEL: <__wasm_call_ctors>: ; DIS-EMPTY: ; DIS-NEXT: end diff --git a/llvm/include/llvm/Analysis/DXILResource.h b/llvm/include/llvm/Analysis/DXILResource.h index 87c5615c28ee0..d4b1a9e2ca340 100644 --- a/llvm/include/llvm/Analysis/DXILResource.h +++ b/llvm/include/llvm/Analysis/DXILResource.h @@ -446,6 +446,13 @@ class DXILBindingMap { return Pos == CallMap.end() ? Infos.end() : (Infos.begin() + Pos->second); } + /// Resolves a resource handle into a vector of ResourceBindingInfos that + /// represent the possible unique creations of the handle. Certain cases are + /// ambiguous, so multiple creation instructions may be returned. The resulting + /// ResourceBindingInfo can be used to deduplicate unique handles that + /// reference the same resource. + SmallVector<dxil::ResourceBindingInfo> findByUse(const Value *Key) const; + const_iterator find(const CallInst *Key) const { auto Pos = CallMap.find(Key); return Pos == CallMap.end() ? 
Infos.end() : (Infos.begin() + Pos->second); diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h index b25f96571741e..ede51c28fc94d 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/DependencyGraph.h @@ -123,6 +123,10 @@ class DGNode { --UnscheduledSuccs; } void incrUnscheduledSuccs() { ++UnscheduledSuccs; } + void resetScheduleState() { + UnscheduledSuccs = 0; + Scheduled = false; + } /// \Returns true if all dependent successors have been scheduled. bool ready() const { return UnscheduledSuccs == 0; } /// \Returns true if this node has been scheduled. diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h index 6c2315af0e797..6b56f348f328c 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Scheduler.h @@ -61,7 +61,18 @@ class ReadyListContainer { public: ReadyListContainer() : List(Cmp) {} - void insert(DGNode *N) { List.push(N); } + void insert(DGNode *N) { +#ifndef NDEBUG + assert(!N->scheduled() && "Don't insert a scheduled node!"); + auto ListCopy = List; + while (!ListCopy.empty()) { + DGNode *Top = ListCopy.top(); + ListCopy.pop(); + assert(Top != N && "Node already exists in ready list!"); + } +#endif + List.push(N); + } DGNode *pop() { auto *Back = List.top(); List.pop(); diff --git a/llvm/lib/Analysis/DXILResource.cpp b/llvm/lib/Analysis/DXILResource.cpp index 7f28e63cc117d..4ffc9dbebda8d 100644 --- a/llvm/lib/Analysis/DXILResource.cpp +++ b/llvm/lib/Analysis/DXILResource.cpp @@ -770,6 +770,45 @@ void DXILBindingMap::print(raw_ostream &OS, DXILResourceTypeMap &DRTM, } } +SmallVector<dxil::ResourceBindingInfo> +DXILBindingMap::findByUse(const Value *Key) const { + if (const PHINode *Phi = dyn_cast<PHINode>(Key)) { + SmallVector<dxil::ResourceBindingInfo> Children; + for (const Value *V : Phi->operands()) { + Children.append(findByUse(V)); + } + return Children; + } + + const CallInst *CI = dyn_cast<CallInst>(Key); + if (!CI) + return {}; + + switch (CI->getIntrinsicID()) { + // Found the create, return the binding + case Intrinsic::dx_resource_handlefrombinding: { + const auto *It = find(CI); + assert(It != Infos.end() && "HandleFromBinding must be in resource map"); + return {*It}; + } + default: + break; + } + + // Check if any of the parameters are the resource we are following. If so, + keep searching. 
If none of them are, return an empty list. + const Type *UseType = CI->getType(); + SmallVector<dxil::ResourceBindingInfo> Children; + for (const Value *V : CI->args()) { + if (V->getType() != UseType) + continue; + + Children.append(findByUse(V)); + } + + return Children; +} + //===----------------------------------------------------------------------===// AnalysisKey DXILResourceTypeAnalysis::Key; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index c0581e491720d..3159b497a1ecb 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -192,8 +192,7 @@ class AMDGPUSwLowerLDS { void getLDSMemoryInstructions(Function *Func, SetVector<Instruction *> &LDSInstructions); void replaceKernelLDSAccesses(Function *Func); - Value *getTranslatedGlobalMemoryGEPOfLDSPointer(Value *LoadMallocPtr, - Value *LDSPtr); + Value *getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, Value *LDSPtr); void translateLDSMemoryOperationsToGlobalMemory( Function *Func, Value *LoadMallocPtr, SetVector<Instruction *> &LDSInstructions); @@ -655,20 +654,30 @@ void AMDGPUSwLowerLDS::getLDSMemoryInstructions( } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(&Inst)) { if (XCHG->getPointerAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) LDSInstructions.insert(&Inst); + } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&Inst)) { + if (ASC->getSrcAddressSpace() == AMDGPUAS::LOCAL_ADDRESS && + ASC->getDestAddressSpace() == AMDGPUAS::FLAT_ADDRESS) + LDSInstructions.insert(&Inst); } else continue; } } } -Value * -AMDGPUSwLowerLDS::getTranslatedGlobalMemoryGEPOfLDSPointer(Value *LoadMallocPtr, +Value *AMDGPUSwLowerLDS::getTranslatedGlobalMemoryPtrOfLDS(Value *LoadMallocPtr, Value *LDSPtr) { assert(LDSPtr && "Invalid LDS pointer operand"); - Value *PtrToInt = IRB.CreatePtrToInt(LDSPtr, IRB.getInt32Ty()); - Value *GEP = - IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {PtrToInt}); - return GEP; + Type *LDSPtrType = LDSPtr->getType(); + LLVMContext &Ctx = M.getContext(); + const DataLayout &DL = M.getDataLayout(); + Type *IntTy = DL.getIntPtrType(Ctx, AMDGPUAS::LOCAL_ADDRESS); + if (auto *VecPtrTy = dyn_cast<VectorType>(LDSPtrType)) { + // Handle vector of pointers + ElementCount NumElements = VecPtrTy->getElementCount(); + IntTy = VectorType::get(IntTy, NumElements); + } + Value *GepIndex = IRB.CreatePtrToInt(LDSPtr, IntTy); + return IRB.CreateInBoundsGEP(IRB.getInt8Ty(), LoadMallocPtr, {GepIndex}); } void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( @@ -681,7 +690,7 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) { Value *LIOperand = LI->getPointerOperand(); Value *Replacement = - getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, LIOperand); + getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, LIOperand); LoadInst *NewLI = IRB.CreateAlignedLoad(LI->getType(), Replacement, LI->getAlign(), LI->isVolatile()); NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID()); @@ -691,7 +700,7 @@ } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { Value *SIOperand = SI->getPointerOperand(); Value *Replacement = - getTranslatedGlobalMemoryGEPOfLDSPointer(LoadMallocPtr, SIOperand); + getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, SIOperand); StoreInst *NewSI = IRB.CreateAlignedStore( SI->getValueOperand(), Replacement, SI->getAlign(), SI->isVolatile()); NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID()); @@ -701,8 +710,8 @@ void 
AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( } else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(Inst)) { Value *RMWPtrOperand = RMW->getPointerOperand(); Value *RMWValOperand = RMW->getValOperand(); - Value *Replacement = getTranslatedGlobalMemoryGEPOfLDSPointer( - LoadMallocPtr, RMWPtrOperand); + Value *Replacement = + getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, RMWPtrOperand); AtomicRMWInst *NewRMW = IRB.CreateAtomicRMW( RMW->getOperation(), Replacement, RMWValOperand, RMW->getAlign(), RMW->getOrdering(), RMW->getSyncScopeID()); @@ -712,8 +721,8 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( RMW->eraseFromParent(); } else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(Inst)) { Value *XCHGPtrOperand = XCHG->getPointerOperand(); - Value *Replacement = getTranslatedGlobalMemoryGEPOfLDSPointer( - LoadMallocPtr, XCHGPtrOperand); + Value *Replacement = + getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, XCHGPtrOperand); AtomicCmpXchgInst *NewXCHG = IRB.CreateAtomicCmpXchg( Replacement, XCHG->getCompareOperand(), XCHG->getNewValOperand(), XCHG->getAlign(), XCHG->getSuccessOrdering(), @@ -722,6 +731,16 @@ void AMDGPUSwLowerLDS::translateLDSMemoryOperationsToGlobalMemory( AsanInfo.Instructions.insert(NewXCHG); XCHG->replaceAllUsesWith(NewXCHG); XCHG->eraseFromParent(); + } else if (AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(Inst)) { + Value *AIOperand = ASC->getPointerOperand(); + Value *Replacement = + getTranslatedGlobalMemoryPtrOfLDS(LoadMallocPtr, AIOperand); + Value *NewAI = IRB.CreateAddrSpaceCast(Replacement, ASC->getType()); + // Note: No need to add the instruction to the AsanInfo list of + // instructions to instrument. The FLAT_ADDRESS ptr would already have + // been instrumented by the asan pass prior to this pass. + ASC->replaceAllUsesWith(NewAI); + ASC->eraseFromParent(); } else report_fatal_error("Unimplemented LDS lowering instruction"); } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index ceab6c9dcca34..7dace11d208a0 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -5931,11 +5931,15 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, if (!MO) MO = &MI.getOperand(OpIdx); - const MachineOperand *UsedLiteral = nullptr; + const bool IsInlineConst = !MO->isReg() && isInlineConstant(*MO, OpInfo); - int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); - int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0; - if (isVALU(MI) && usesConstantBus(MRI, *MO, OpInfo)) { + if (isVALU(MI) && !IsInlineConst && usesConstantBus(MRI, *MO, OpInfo)) { + const MachineOperand *UsedLiteral = nullptr; + + int ConstantBusLimit = ST.getConstantBusLimit(MI.getOpcode()); + int LiteralLimit = !isVOP3(MI) || ST.hasVOP3Literal() ? 1 : 0; + + // TODO: Be more permissive with frame indexes. if (!MO->isReg() && !isInlineConstant(*MO, OpInfo)) { if (!LiteralLimit--) return false; @@ -5974,9 +5978,19 @@ bool SIInstrInfo::isOperandLegal(const MachineInstr &MI, unsigned OpIdx, return false; } } - } else if (ST.hasNoF16PseudoScalarTransInlineConstants() && !MO->isReg() && - isF16PseudoScalarTrans(MI.getOpcode()) && - isInlineConstant(*MO, OpInfo)) { + } else if (!IsInlineConst && !MO->isReg() && isSALU(MI)) { + // There can be at most one literal operand, but it can be repeated. 
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + if (i == OpIdx) + continue; + const MachineOperand &Op = MI.getOperand(i); + if (!Op.isReg() && !Op.isFI() && + !isInlineConstant(Op, InstDesc.operands()[i]) && + !Op.isIdenticalTo(*MO)) + return false; + } + } else if (IsInlineConst && ST.hasNoF16PseudoScalarTransInlineConstants() && + isF16PseudoScalarTrans(MI.getOpcode())) { return false; } diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 21898da1912f5..d5c6e8af109f4 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -1151,7 +1151,7 @@ let isCommutable = 1, isReMaterializable = 1 in { defm V_PK_ADD_F32 : VOP3PInst<"v_pk_add_f32", VOP3P_Profile<VOP_V2F32_V2F32_V2F32, VOP3_PACKED>, any_fadd>; } // End SubtargetPredicate = HasPackedFP32Ops - let SubtargetPredicate = HasPkMovB32 in + let SubtargetPredicate = HasPkMovB32, isAsCheapAsAMove = 1 in defm V_PK_MOV_B32 : VOP3PInst<"v_pk_mov_b32", VOP3P_Profile<VOP_V2I32_V2I32_V2I32, VOP3_PACKED>>; } // End isCommutable = 1, isReMaterializable = 1 diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index e5a98598370ec..66b989a84b1ce 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -1092,6 +1092,10 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VFWNMSAC_VF: case RISCV::VFWMACCBF16_VV: case RISCV::VFWMACCBF16_VF: + // Vector Floating-Point Square-Root Instruction + case RISCV::VFSQRT_V: + // Vector Floating-Point Reciprocal Square-Root Estimate Instruction + case RISCV::VFRSQRT7_V: // Vector Floating-Point MIN/MAX Instructions case RISCV::VFMIN_VF: case RISCV::VFMIN_VV: diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 3e9fc31d7bfc2..62854ea896179 100644 --- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -525,7 +525,7 @@ class SparcOperand : public MCParsedAsmOperand { } static bool MorphToIntPairReg(SparcOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); assert(Op.Reg.Kind == rk_IntReg); unsigned regIdx = 32; if (Reg >= Sparc::G0 && Reg <= Sparc::G7) @@ -544,7 +544,7 @@ class SparcOperand : public MCParsedAsmOperand { } static bool MorphToDoubleReg(SparcOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); assert(Op.Reg.Kind == rk_FloatReg); unsigned regIdx = Reg - Sparc::F0; if (regIdx % 2 || regIdx > 31) @@ -555,7 +555,7 @@ class SparcOperand : public MCParsedAsmOperand { } static bool MorphToQuadReg(SparcOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); unsigned regIdx = 0; switch (Op.Reg.Kind) { default: llvm_unreachable("Unexpected register kind!"); @@ -578,7 +578,7 @@ class SparcOperand : public MCParsedAsmOperand { } static bool MorphToCoprocPairReg(SparcOperand &Op) { - unsigned Reg = Op.getReg(); + MCRegister Reg = Op.getReg(); assert(Op.Reg.Kind == rk_CoprocReg); unsigned regIdx = 32; if (Reg >= Sparc::C0 && Reg <= Sparc::C31) @@ -592,7 +592,7 @@ class SparcOperand : public MCParsedAsmOperand { static std::unique_ptr<SparcOperand> MorphToMEMrr(unsigned Base, std::unique_ptr<SparcOperand> Op) { - unsigned offsetReg = Op->getReg(); + MCRegister offsetReg = Op->getReg(); Op->Kind = k_MemoryReg; Op->Mem.Base = Base; Op->Mem.OffsetReg = offsetReg; diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp 
index 37503f4bc2ae2..f2a61c95fefb5 100644 --- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp +++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp @@ -66,12 +66,12 @@ bool SparcInstPrinter::printSparcAliasInstr(const MCInst *MI, return false; if (!MI->getOperand(0).isReg()) return false; - switch (MI->getOperand(0).getReg()) { + switch (MI->getOperand(0).getReg().id()) { default: return false; case SP::G0: // jmp $addr | ret | retl if (MI->getOperand(2).isImm() && MI->getOperand(2).getImm() == 8) { - switch(MI->getOperand(1).getReg()) { + switch (MI->getOperand(1).getReg().id()) { default: break; case SP::I7: O << "\tret"; return true; case SP::O7: O << "\tretl"; return true; @@ -115,7 +115,7 @@ void SparcInstPrinter::printOperand(const MCInst *MI, int opNum, const MCOperand &MO = MI->getOperand (opNum); if (MO.isReg()) { - unsigned Reg = MO.getReg(); + MCRegister Reg = MO.getReg(); if (isV9(STI)) printRegName(O, Reg, SP::RegNamesStateReg); else diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td index 72b103b0bb0c5..cf164acba9ec0 100644 --- a/llvm/lib/Target/X86/X86CallingConv.td +++ b/llvm/lib/Target/X86/X86CallingConv.td @@ -267,19 +267,19 @@ def RetCC_X86Common : CallingConv<[ // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3 // can only be used by ABI non-compliant code. If the target doesn't have XMM // registers, it won't have vector types. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v8bf16, v4f32, v2f64], CCAssignToReg<[XMM0,XMM1,XMM2,XMM3]>>, // 256-bit vectors are returned in YMM0 and XMM1, when they fit. YMM2 and YMM3 // can only be used by ABI non-compliant code. This vector type is only // supported while using the AVX target feature. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v16bf16, v8f32, v4f64], CCAssignToReg<[YMM0,YMM1,YMM2,YMM3]>>, // 512-bit vectors are returned in ZMM0 and ZMM1, when they fit. ZMM2 and ZMM3 // can only be used by ABI non-compliant code. This vector type is only // supported while using the AVX-512 target feature. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v32bf16, v16f32, v8f64], CCAssignToReg<[ZMM0,ZMM1,ZMM2,ZMM3]>>, // Long double types are always returned in FP0 (even with SSE), @@ -565,7 +565,7 @@ def CC_X86_64_C : CallingConv<[ CCIfType<[v64i1], CCPromoteToType>, // The first 8 FP/Vector arguments are passed in XMM registers. - CCIfType<[f16, f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], + CCIfType<[f16, f32, f64, f128, v16i8, v8i16, v4i32, v2i64, v8f16, v8bf16, v4f32, v2f64], CCIfSubtarget<"hasSSE1()", CCAssignToReg<[XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7]>>>, @@ -574,13 +574,13 @@ def CC_X86_64_C : CallingConv<[ // FIXME: This isn't precisely correct; the x86-64 ABI document says that // fixed arguments to vararg functions are supposed to be passed in // registers. Actually modeling that would be a lot of work, though. - CCIfNotVarArg>>>, // The first 8 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg>>>, @@ -593,14 +593,14 @@ def CC_X86_64_C : CallingConv<[ CCIfType<[f80, f128], CCAssignToStack<0, 0>>, // Vectors get 16-byte stack slots that are 16-byte aligned. 
- CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCAssignToStack<16, 16>>, + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v8bf16, v4f32, v2f64], CCAssignToStack<16, 16>>, // 256-bit vectors get 32-byte stack slots that are 32-byte aligned. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v16bf16, v8f32, v4f64], CCAssignToStack<32, 32>>, // 512-bit vectors get 64-byte stack slots that are 64-byte aligned. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v32bf16, v16f32, v8f64], CCAssignToStack<64, 64>> ]>; @@ -631,13 +631,13 @@ def CC_X86_Win64_C : CallingConv<[ CCIfCFGuardTarget>, // 128 bit vectors are passed by pointer - CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], CCPassIndirect>, + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v8bf16, v4f32, v2f64], CCPassIndirect>, // 256 bit vectors are passed by pointer - CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], CCPassIndirect>, + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v16bf16, v8f32, v4f64], CCPassIndirect>, // 512 bit vectors are passed by pointer - CCIfType<[v64i8, v32i16, v16i32, v32f16, v16f32, v8f64, v8i64], CCPassIndirect>, + CCIfType<[v64i8, v32i16, v16i32, v32f16, v32bf16, v16f32, v8f64, v8i64], CCPassIndirect>, // Long doubles are passed by pointer CCIfType<[f80], CCPassIndirect>, @@ -734,15 +734,15 @@ def CC_X86_64_AnyReg : CallingConv<[ /// values are spilled on the stack. def CC_X86_32_Vector_Common : CallingConv<[ // Other SSE vectors get 16-byte stack slots that are 16-byte aligned. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v8bf16, v4f32, v2f64], CCAssignToStack<16, 16>>, // 256-bit AVX vectors get 32-byte stack slots that are 32-byte aligned. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v16bf16, v8f32, v4f64], CCAssignToStack<32, 32>>, // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 64-byte aligned. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v32bf16, v16f32, v8f64], CCAssignToStack<64, 64>> ]>; @@ -750,15 +750,15 @@ def CC_X86_32_Vector_Common : CallingConv<[ /// values are spilled on the stack. def CC_X86_Win32_Vector : CallingConv<[ // Other SSE vectors get 16-byte stack slots that are 4-byte aligned. - CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64], + CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v8bf16, v4f32, v2f64], CCAssignToStack<16, 4>>, // 256-bit AVX vectors get 32-byte stack slots that are 4-byte aligned. - CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64], + CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v16bf16, v8f32, v4f64], CCAssignToStack<32, 4>>, // 512-bit AVX 512-bit vectors get 64-byte stack slots that are 4-byte aligned. - CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v16f32, v8f64], + CCIfType<[v64i8, v32i16, v16i32, v8i64, v32f16, v32bf16, v16f32, v8f64], CCAssignToStack<64, 4>> ]>; @@ -766,16 +766,16 @@ def CC_X86_Win32_Vector : CallingConv<[ // vector registers def CC_X86_32_Vector_Standard : CallingConv<[ // SSE vector arguments are passed in XMM registers. - CCIfNotVarArg>>, // AVX 256-bit vector arguments are passed in YMM registers. - CCIfNotVarArg>>>, // AVX 512-bit vector arguments are passed in ZMM registers. 
- CCIfNotVarArg>>, CCIfIsVarArgOnWin>, @@ -786,16 +786,16 @@ def CC_X86_32_Vector_Standard : CallingConv<[ // vector registers. def CC_X86_32_Vector_Darwin : CallingConv<[ // SSE vector arguments are passed in XMM registers. - CCIfNotVarArg>>, // AVX 256-bit vector arguments are passed in YMM registers. - CCIfNotVarArg>>>, // AVX 512-bit vector arguments are passed in ZMM registers. - CCIfNotVarArg>>, CCDelegateTo diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 2e14145aef884..cf38fc5f058f2 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -829,7 +829,10 @@ static Value *foldSelectICmpAndBinOp(const ICmpInst *IC, Value *TrueVal, if (NeedXor) V = Builder.CreateXor(V, *C2); - return Builder.CreateBinOp(BinOp->getOpcode(), Y, V); + auto *Res = Builder.CreateBinOp(BinOp->getOpcode(), Y, V); + if (auto *BO = dyn_cast<BinaryOperator>(Res)) + BO->copyIRFlags(BinOp); + return Res; } /// Canonicalize a set or clear of a masked set of constant bits to diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp index 2f7d7087ca880..3e37e07aabc5c 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Scheduler.cpp @@ -77,12 +77,12 @@ void Scheduler::scheduleAndUpdateReadyList(SchedBundle &Bndl) { // Set nodes as "scheduled" and decrement the UnscheduledSuccs counter of all // dependency predecessors. for (DGNode *N : Bndl) { - N->setScheduled(true); for (auto *DepN : N->preds(DAG)) { DepN->decrUnscheduledSuccs(); - if (DepN->ready()) + if (DepN->ready() && !DepN->scheduled()) ReadyList.insert(DepN); } + N->setScheduled(true); } } @@ -188,6 +188,19 @@ Scheduler::getBndlSchedState(ArrayRef<Instruction *> Instrs) const { } void Scheduler::trimSchedule(ArrayRef<Instruction *> Instrs) { + // | Legend: N: DGNode + // N <- DAGInterval.top() | B: SchedBundle + // N | *: Contains instruction in Instrs + // B <- TopI (Top of schedule) +------------------------------------------- + // B + // B * + // B + // B * <- LowestI (Lowest in Instrs) + // B + // N + // N + // N <- DAGInterval.bottom() + // Instruction *TopI = &*ScheduleTopItOpt.value(); Instruction *LowestI = VecUtils::getLowest(Instrs); // Destroy the schedule bundles from LowestI all the way to the top. @@ -199,13 +212,28 @@ void Scheduler::trimSchedule(ArrayRef<Instruction *> Instrs) { if (auto *SB = N->getSchedBundle()) eraseBundle(SB); } - // TODO: For now we clear the DAG. Trim view once it gets implemented. - Bndls.clear(); - DAG.clear(); - - // Since we are scheduling NewRegion from scratch, we clear the ready lists. - // The nodes currently in the list may not be ready after clearing the View. + // The DAG nodes carry scheduling state, like the UnscheduledSuccs counter and + // the Scheduled flag, which we need to reset for all nodes from LowestI to + // the top of the schedule. DAG nodes above the top of the schedule that + // depend on reset nodes also need their UnscheduledSuccs adjusted. + Interval<Instruction> ResetIntvl(TopI, LowestI); + for (Instruction &I : ResetIntvl) { + auto *N = DAG.getNode(&I); + N->resetScheduleState(); + // Recompute UnscheduledSuccs both for nodes in ResetIntvl and for nodes + // above the top of the schedule. 
+ for (auto *PredN : N->preds(DAG)) + PredN->incrUnscheduledSuccs(); + } + // Refill the ready list by visiting all nodes from the top of the DAG to LowestI. ReadyList.clear(); + Interval<Instruction> RefillIntvl(DAG.getInterval().top(), LowestI); + for (Instruction &I : RefillIntvl) { + auto *N = DAG.getNode(&I); + if (N->ready()) + ReadyList.insert(N); + } } bool Scheduler::trySchedule(ArrayRef<Instruction *> Instrs) { @@ -214,6 +242,12 @@ return I->getParent() == (*Instrs.begin())->getParent(); }) && "Instrs not in the same BB, should have been rejected by Legality!"); + // TODO: For now don't cross BBs. + if (!DAG.getInterval().empty()) { + auto *BB = DAG.getInterval().top()->getParent(); + if (any_of(Instrs, [BB](auto *I) { return I->getParent() != BB; })) + return false; + } if (ScheduledBB == nullptr) ScheduledBB = Instrs[0]->getParent(); // We don't support crossing BBs for now. @@ -230,21 +264,13 @@ // top-most part of the schedule that includes the instrs in the bundle and // re-schedule. trimSchedule(Instrs); - ScheduleTopItOpt = std::nullopt; - [[fallthrough]]; + ScheduleTopItOpt = std::next(VecUtils::getLowest(Instrs)->getIterator()); + return tryScheduleUntil(Instrs); case BndlSchedState::NoneScheduled: { // TODO: Set the window of the DAG that we are interested in. if (!ScheduleTopItOpt) // We start scheduling at the bottom instr of Instrs. ScheduleTopItOpt = std::next(VecUtils::getLowest(Instrs)->getIterator()); - - // TODO: For now don't cross BBs. - if (!DAG.getInterval().empty()) { - auto *BB = DAG.getInterval().top()->getParent(); - if (any_of(Instrs, [BB](auto *I) { return I->getParent() != BB; })) - return false; - } - // Extend the DAG to include Instrs. Interval<Instruction> Extension = DAG.extend(Instrs); // Add nodes to ready list. 
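The scheduler changes above all enforce one invariant: a node may enter the ready list only while it is unscheduled, has no unscheduled successors (UnscheduledSuccs == 0), and is not already queued. The following is a minimal standalone C++ sketch of that invariant; Node and ReadyList here are hypothetical stand-ins for the PR's DGNode and ReadyListContainer, not the actual LLVM classes.

#include <cassert>
#include <vector>

struct Node {
  unsigned UnscheduledSuccs = 0; // dependency successors not yet scheduled
  bool Scheduled = false;
  bool ready() const { return UnscheduledSuccs == 0; }
};

class ReadyList {
  std::vector<Node *> List; // priority ordering elided for brevity
public:
  void insert(Node *N) {
    // Mirrors the new debug checks: never queue a scheduled or duplicate node.
    assert(!N->Scheduled && "Don't insert a scheduled node!");
    assert(N->ready() && "Only ready nodes belong in the ready list!");
    for (Node *Q : List)
      assert(Q != N && "Node already exists in ready list!");
    List.push_back(N);
  }
};

int main() {
  Node A;          // no unscheduled successors, so A is ready
  ReadyList RL;
  RL.insert(&A);   // OK; inserting &A a second time would trip the assert
}

The real ReadyListContainer copies its priority_queue inside #ifndef NDEBUG to scan for duplicates, since std::priority_queue offers no iteration; the plain vector above is only a simplification for illustration.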
diff --git a/llvm/test/Analysis/ProfileSummary/basic.ll b/llvm/test/Analysis/ProfileSummary/basic.ll index c4f48ccafde86..0385c3a921c01 100644 --- a/llvm/test/Analysis/ProfileSummary/basic.ll +++ b/llvm/test/Analysis/ProfileSummary/basic.ll @@ -7,9 +7,9 @@ define void @f1() !prof !20 { ; CHECK-LABEL: f1 :hot -; OVERRIDE-HOT-LABEL: f1 +; OVERRIDE-HOT-LABEL: f1{{$}} ; OVERRIDE-COLD-LABEL: f1 :hot -; OVERRIDE-BOTH-LABEL: f1 +; OVERRIDE-BOTH-LABEL: f1{{$}} ; HOT-CUTOFF-0-LABEL: f1{{$}} ; COLD-CUTOFF-0-LABEL: f1 :cold @@ -19,8 +19,8 @@ define void @f1() !prof !20 { define void @f2() !prof !21 { ; CHECK-LABEL: f2 :cold ; OVERRIDE-HOT-LABEL: f2 :cold -; OVERRIDE-COLD-LABEL: f2 -; OVERRIDE-BOTH-LABEL: f2 +; OVERRIDE-COLD-LABEL: f2{{$}} +; OVERRIDE-BOTH-LABEL: f2 :cold ; HOT-CUTOFF-0-LABEL: f2 :cold ; COLD-CUTOFF-0-LABEL: f2 :cold @@ -28,10 +28,10 @@ define void @f2() !prof !21 { } define void @f3() !prof !22 { -; CHECK-LABEL: f3 -; OVERRIDE-HOT-LABEL: f3 -; OVERRIDE-COLD-LABEL: f3 -; OVERRIDE-BOTH-LABEL: f3 +; CHECK-LABEL: f3 :hot +; OVERRIDE-HOT-LABEL: f3{{$}} +; OVERRIDE-COLD-LABEL: f3 :hot +; OVERRIDE-BOTH-LABEL: f3 :cold ; HOT-CUTOFF-0-LABEL: f3{{$}} ; COLD-CUTOFF-0-LABEL: f3 :cold diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll index ae2bcbbb81b5f..a6e6b84bba304 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-non-kernel-declaration.ll @@ -20,8 +20,12 @@ define void @non_kernel_function() sanitize_address { ; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]] -; CHECK-NEXT: [[Y:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr -; CHECK-NEXT: [[TMP9:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr +; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP14]] +; CHECK-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(1) [[TMP10]] to ptr +; CHECK-NEXT: [[TMP12:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP12]] +; CHECK-NEXT: [[TMP9:%.*]] = addrspacecast ptr addrspace(1) [[TMP13]] to ptr ; CHECK-NEXT: store i8 5, ptr [[TMP9]], align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll index 3a05f93df35a3..b9b4c90daea87 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-asan.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s ; Test to check if static LDS is lowered correctly when a non-kernel with LDS accesses is called from kernel. 
@@ -28,8 +28,12 @@ define void @use_variables() sanitize_address { ; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]] -; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr -; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP34:%.*]] = addrspacecast ptr addrspace(1) [[TMP33]] to ptr +; CHECK-NEXT: [[TMP35:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP35]] +; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(1) [[TMP36]] to ptr ; CHECK-NEXT: store i8 3, ptr [[TMP16]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32 ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP14]] @@ -45,16 +49,16 @@ define void @use_variables() sanitize_address { ; CHECK-NEXT: [[TMP25:%.*]] = and i1 [[TMP21]], [[TMP24]] ; CHECK-NEXT: [[TMP26:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP25]]) ; CHECK-NEXT: [[TMP27:%.*]] = icmp ne i64 [[TMP26]], 0 -; CHECK-NEXT: br i1 [[TMP27]], label [[ASAN_REPORT:%.*]], label [[TMP30:%.*]], !prof [[PROF2:![0-9]+]] -; CHECK: asan.report: -; CHECK-NEXT: br i1 [[TMP25]], label [[TMP28:%.*]], label [[TMP29:%.*]] -; CHECK: 28: +; CHECK-NEXT: br i1 [[TMP27]], label %[[ASAN_REPORT:.*]], label %[[BB35:.*]], !prof [[PROF2:![0-9]+]] +; CHECK: [[ASAN_REPORT]]: +; CHECK-NEXT: br i1 [[TMP25]], label %[[BB33:.*]], label %[[BB34:.*]] +; CHECK: [[BB33]]: ; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP32]]) #[[ATTR7:[0-9]+]] ; CHECK-NEXT: call void @llvm.amdgcn.unreachable() -; CHECK-NEXT: br label [[TMP29]] -; CHECK: 29: -; CHECK-NEXT: br label [[TMP30]] -; CHECK: 30: +; CHECK-NEXT: br label %[[BB34]] +; CHECK: [[BB34]]: +; CHECK-NEXT: br label %[[BB35]] +; CHECK: [[BB35]]: ; CHECK-NEXT: store i8 3, ptr addrspace(1) [[TMP31]], align 8 ; CHECK-NEXT: ret void ; @@ -67,15 +71,15 @@ define void @use_variables() sanitize_address { define amdgpu_kernel void @k0() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @k0( ; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB24:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4 ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4 ; 
CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP13]], [[TMP14]] @@ -100,9 +104,9 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 132 ; CHECK-NEXT: [[TMP68:%.*]] = ptrtoint ptr addrspace(1) [[TMP67]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP68]], i64 28) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 24: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB24]] +; CHECK: [[BB24]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 @@ -124,16 +128,16 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP41:%.*]] = and i1 [[TMP37]], [[TMP40]] ; CHECK-NEXT: [[TMP42:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP41]]) ; CHECK-NEXT: [[TMP43:%.*]] = icmp ne i64 [[TMP42]], 0 -; CHECK-NEXT: br i1 [[TMP43]], label [[ASAN_REPORT:%.*]], label [[TMP46:%.*]], !prof [[PROF2]] -; CHECK: asan.report: -; CHECK-NEXT: br i1 [[TMP41]], label [[TMP44:%.*]], label [[CONDFREE:%.*]] -; CHECK: 44: +; CHECK-NEXT: br i1 [[TMP43]], label %[[ASAN_REPORT:.*]], label %[[BB46:.*]], !prof [[PROF2]] +; CHECK: [[ASAN_REPORT]]: +; CHECK-NEXT: br i1 [[TMP41]], label %[[BB44:.*]], label %[[BB45:.*]] +; CHECK: [[BB44]]: ; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP32]]) #[[ATTR7]] ; CHECK-NEXT: call void @llvm.amdgcn.unreachable() -; CHECK-NEXT: br label [[CONDFREE]] -; CHECK: 45: -; CHECK-NEXT: br label [[TMP46]] -; CHECK: 46: +; CHECK-NEXT: br label %[[BB45]] +; CHECK: [[BB45]]: +; CHECK-NEXT: br label %[[BB46]] +; CHECK: [[BB46]]: ; CHECK-NEXT: store i8 7, ptr addrspace(1) [[TMP31]], align 1 ; CHECK-NEXT: [[TMP47:%.*]] = ptrtoint ptr addrspace(3) [[TMP18]] to i32 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP47]] @@ -152,16 +156,16 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP59:%.*]] = and i1 [[TMP54]], [[TMP58]] ; CHECK-NEXT: [[TMP60:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP59]]) ; CHECK-NEXT: [[TMP61:%.*]] = icmp ne i64 [[TMP60]], 0 -; CHECK-NEXT: br i1 [[TMP61]], label [[ASAN_REPORT1:%.*]], label [[TMP64:%.*]], !prof [[PROF2]] -; CHECK: asan.report1: -; CHECK-NEXT: br i1 [[TMP59]], label [[TMP62:%.*]], label [[TMP63:%.*]] -; CHECK: 64: +; CHECK-NEXT: br i1 [[TMP61]], label %[[ASAN_REPORT1:.*]], label %[[BB66:.*]], !prof [[PROF2]] +; CHECK: [[ASAN_REPORT1]]: +; CHECK-NEXT: br i1 [[TMP59]], label %[[BB64:.*]], label %[[BB65:.*]] +; CHECK: [[BB64]]: ; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP83]]) #[[ATTR7]] ; CHECK-NEXT: call void @llvm.amdgcn.unreachable() -; CHECK-NEXT: br label [[TMP63]] -; CHECK: 65: -; CHECK-NEXT: br label [[TMP64]] -; CHECK: 66: +; CHECK-NEXT: br label %[[BB65]] +; CHECK: [[BB65]]: +; CHECK-NEXT: br label %[[BB66]] +; CHECK: [[BB66]]: ; CHECK-NEXT: [[TMP84:%.*]] = ptrtoint ptr addrspace(1) [[TMP82]] to i64 ; CHECK-NEXT: [[TMP85:%.*]] = lshr i64 [[TMP84]], 3 ; CHECK-NEXT: [[TMP69:%.*]] = add i64 [[TMP85]], 2147450880 @@ -174,28 +178,28 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP76:%.*]] = and i1 [[TMP72]], [[TMP75]] ; 
CHECK-NEXT: [[TMP77:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[TMP76]]) ; CHECK-NEXT: [[TMP78:%.*]] = icmp ne i64 [[TMP77]], 0 -; CHECK-NEXT: br i1 [[TMP78]], label [[ASAN_REPORT2:%.*]], label [[TMP81:%.*]], !prof [[PROF2]] -; CHECK: asan.report2: -; CHECK-NEXT: br i1 [[TMP76]], label [[TMP79:%.*]], label [[TMP80:%.*]] -; CHECK: 79: +; CHECK-NEXT: br i1 [[TMP78]], label %[[ASAN_REPORT2:.*]], label %[[BB81:.*]], !prof [[PROF2]] +; CHECK: [[ASAN_REPORT2]]: +; CHECK-NEXT: br i1 [[TMP76]], label %[[BB79:.*]], label %[[BB80:.*]] +; CHECK: [[BB79]]: ; CHECK-NEXT: call void @__asan_report_store1(i64 [[TMP84]]) #[[ATTR7]] ; CHECK-NEXT: call void @llvm.amdgcn.unreachable() -; CHECK-NEXT: br label [[TMP80]] -; CHECK: 80: -; CHECK-NEXT: br label [[TMP81]] -; CHECK: 81: +; CHECK-NEXT: br label %[[BB80]] +; CHECK: [[BB80]]: +; CHECK-NEXT: br label %[[BB81]] +; CHECK: [[BB81]]: ; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP48]], align 2 -; CHECK-NEXT: br label [[CONDFREE1:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64 ; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @use_variables() diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll index 1dd391ec6321a..255dda562c1ea 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested-asan.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s ; Test to check if LDS accesses are lowered correctly when a call is made to nested non-kernel. @@ -6,50 +6,64 @@ @A = external addrspace(3) global [8 x ptr] @B = external addrspace(3) global [0 x i32] +;. 
+; CHECK: @llvm.amdgcn.sw.lds.kernel_0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel_0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_0.md.type { %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.kernel_2 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel_2.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_2.md.type { %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.kernel_1 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]]
+; CHECK: @llvm.amdgcn.kernel_1.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel_1.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_1.md.type { %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.kernel_3 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]]
+; CHECK: @llvm.amdgcn.kernel_3.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel_3.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_3.md.type { %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [4 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [4 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0)], [2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_2.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_3.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0)]], no_sanitize_address
+;.
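For readers tracing these globals: every instrumented non-kernel function performs the same two table lookups, keyed by `llvm.amdgcn.lds.kernel.id`, to find its software-LDS buffer and the per-variable byte offset. A rough IRBuilder sketch of that lookup follows; the function and variable names are ours, not the pass's, and the IRBuilder calls are standard LLVM API:

```cpp
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"

using namespace llvm;

// Sketch: emit the base/offset table lookup the CHECK lines above match.
// BaseTable/OffsetTable are the @llvm.amdgcn.sw.lds.*.table globals; Slot
// picks the variable's column in the offset table.
static Value *emitSwLdsLookup(IRBuilder<> &B, GlobalVariable *BaseTable,
                              GlobalVariable *OffsetTable, unsigned Slot) {
  // Which kernel are we running under? Indexes both tables.
  Value *KernelId =
      B.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {}, {});
  // Load the addrspace(3) handle of this kernel's software LDS buffer.
  Value *BaseGEP = B.CreateInBoundsGEP(BaseTable->getValueType(), BaseTable,
                                       {B.getInt32(0), KernelId});
  Value *LdsBase = B.CreateLoad(B.getPtrTy(/*AS=*/3), BaseGEP);
  // Load the per-variable byte offset out of the metadata struct.
  Value *OffGEP =
      B.CreateInBoundsGEP(OffsetTable->getValueType(), OffsetTable,
                          {B.getInt32(0), KernelId, B.getInt32(Slot)});
  Value *MDPtr = B.CreateLoad(B.getPtrTy(/*AS=*/1), OffGEP);
  Value *Off = B.CreateLoad(B.getInt32Ty(), MDPtr);
  // The variable's LDS address is base + offset.
  return B.CreateInBoundsGEP(B.getInt8Ty(), LdsBase, Off);
}
```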
define amdgpu_kernel void @kernel_0() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_0( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 2), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP11]], i64 [[TMP14]]) -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) -; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8 -; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP21]], i64 24) -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96 -; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 18: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; 
CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 96 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 32) +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 ; CHECK-NEXT: call void @call_store_A() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: -; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64 -; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64 -; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @call_store_A() @@ -58,56 +72,56 @@ define amdgpu_kernel void @kernel_0() sanitize_address { define amdgpu_kernel void @kernel_1() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_1( -; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]] -; CHECK: Malloc: -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, align 4 -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 0, i32 2), align 4 -; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] -; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15 -; CHECK-NEXT: store i32 [[TMP21]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[TMP7]], align 4 -; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) 
@llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 1), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], 3 -; CHECK-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP9]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4 -; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 2), align 4 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP21]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB23:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 0, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP9]], i64 15 +; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4 +; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 1), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 4 +; CHECK-NEXT: store i32 [[TMP14]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP8]], [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64 ; CHECK-NEXT: [[TMP17:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[TMP17]] to i64 ; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP16]], i64 [[TMP18]]) -; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) -; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 -; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24) -; CHECK-NEXT: br label [[TMP14]] -; CHECK: 23: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP21]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP22]], i64 24) +; CHECK-NEXT: br label %[[BB23]] +; CHECK: [[BB23]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 +; CHECK-NEXT: 
[[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_1.dynlds) ] ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: -; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64 -; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 -; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP25:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64 +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP24]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP27]], i64 [[TMP26]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; %ptr = call ptr @get_B_ptr() @@ -116,48 +130,48 @@ define amdgpu_kernel void @kernel_1() sanitize_address { define amdgpu_kernel void @kernel_2() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_2( -; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: -; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), align 4 -; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 2), align 4 -; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]] -; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[TMP13:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr [[TMP13]] to i64 -; CHECK-NEXT: [[TMP19:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP11]], i64 [[TMP14]]) -; CHECK-NEXT: [[TMP6:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) -; CHECK-NEXT: store ptr addrspace(1) [[TMP6]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 8 -; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP21]], i64 24) -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96 -; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32) -; CHECK-NEXT: 
br label [[TMP7]] -; CHECK: 18: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 96 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 32) +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 +; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 ; CHECK-NEXT: call void @store_A() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: -; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64 -; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64 -; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64 +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @store_A() @@ -166,56 +180,56 @@ define amdgpu_kernel void @kernel_2() sanitize_address { define amdgpu_kernel void @kernel_3() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_3( -; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META6:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: 
[[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]] -; CHECK: Malloc: -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, align 4 -; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 0, i32 2), align 4 -; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] -; CHECK-NEXT: [[TMP6:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP6]], i64 15 -; CHECK-NEXT: store i32 [[TMP21]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0), align 4 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(4) [[TMP7]], align 4 -; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 1), align 4 -; CHECK-NEXT: [[TMP9:%.*]] = add i32 [[TMP8]], 3 -; CHECK-NEXT: [[TMP10:%.*]] = udiv i32 [[TMP9]], 4 -; CHECK-NEXT: [[TMP11:%.*]] = mul i32 [[TMP10]], 4 -; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 2), align 4 -; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP21]], [[TMP11]] +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB23:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 0, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(4), ptr addrspace(4) [[TMP9]], i64 15 +; CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(4) [[TMP10]], align 4 +; CHECK-NEXT: store i32 [[TMP11]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 1), align 4 +; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 +; CHECK-NEXT: [[TMP13:%.*]] = udiv i32 [[TMP12]], 4 +; CHECK-NEXT: [[TMP14:%.*]] = mul i32 [[TMP13]], 4 +; CHECK-NEXT: store i32 [[TMP14]], ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP15:%.*]] = add i32 [[TMP8]], [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = zext i32 [[TMP15]] to i64 ; CHECK-NEXT: [[TMP17:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr [[TMP17]] to i64 ; CHECK-NEXT: [[TMP19:%.*]] = 
call i64 @__asan_malloc_impl(i64 [[TMP16]], i64 [[TMP18]]) -; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) -; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 -; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64 -; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24) -; CHECK-NEXT: br label [[TMP14]] -; CHECK: 23: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: [[TMP20:%.*]] = inttoptr i64 [[TMP19]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP20]], ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP21]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP22]], i64 24) +; CHECK-NEXT: br label %[[BB23]] +; CHECK: [[BB23]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 +; CHECK-NEXT: [[TMP24:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_3.dynlds) ] ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: -; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0) -; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64 -; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 -; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP25:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64 +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP24]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP27]], i64 [[TMP26]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; %ptr = call ptr @get_B_ptr() @@ -237,14 +251,16 @@ define private void @store_A() sanitize_address { ; CHECK-SAME: ) #[[ATTR2]] { ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 -; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]] -; 
CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr -; CHECK-NEXT: store ptr [[TMP10]], ptr null, align 8 +; CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(1) [[TMP10]] to ptr +; CHECK-NEXT: store ptr [[TMP11]], ptr null, align 8 ; CHECK-NEXT: ret void ; store ptr addrspacecast (ptr addrspace(3) @A to ptr), ptr null @@ -256,14 +272,16 @@ define private ptr @get_B_ptr() sanitize_address { ; CHECK-SAME: ) #[[ATTR2]] { ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x ptr addrspace(3)], ptr addrspace(1) @llvm.amdgcn.sw.lds.base.table, i32 0, i32 [[TMP1]] -; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP4]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr -; CHECK-NEXT: ret ptr [[TMP10]] +; CHECK-NEXT: [[TMP3:%.*]] = load ptr addrspace(3), ptr addrspace(1) [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr addrspace(3) [[TMP3]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [4 x [2 x ptr addrspace(1)]], ptr addrspace(1) @llvm.amdgcn.sw.lds.offset.table, i32 0, i32 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP5]], align 8 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) [[TMP6]], align 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = addrspacecast ptr addrspace(1) [[TMP10]] to ptr +; CHECK-NEXT: ret ptr [[TMP11]] ; ret ptr addrspacecast (ptr addrspace(3) @B to ptr) } @@ -272,8 +290,6 @@ define private ptr @get_B_ptr() sanitize_address { !0 = !{i32 4, !"nosanitize_address", i32 1} ;. -; CHECK: [[META2]] = !{i32 0} -; CHECK: [[META3]] = !{i32 1} -; CHECK: [[META4]] = !{i32 2} -; CHECK: [[META5]] = !{i32 3} +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" } +; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8,8" } ;. 
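The recurring delta across these autogenerated checks is the functional change itself: a flat pointer derived from an LDS address is no longer a direct `addrspacecast` from addrspace(3); it is first rebased into the malloc'ed addrspace(1) buffer via `ptrtoint` plus a byte GEP, so the resulting flat pointer targets global memory that ASan can instrument. A hedged sketch of that replacement sequence (names are illustrative; the real rewrite lives in the amdgpu-sw-lower-lds pass):

```cpp
#include "llvm/IR/IRBuilder.h"

using namespace llvm;

// Sketch of the rewrite the new CHECK lines match: instead of
//   addrspacecast ptr addrspace(3) %lds to ptr
// route the access through the software LDS buffer in global memory.
// SwBuf is the malloc'ed addrspace(1) buffer for the current kernel.
static Value *castSwLdsToFlat(IRBuilder<> &B, Value *LdsPtr, Value *SwBuf) {
  // The addrspace(3) pointer value doubles as the byte offset into SwBuf.
  Value *Off = B.CreatePtrToInt(LdsPtr, B.getInt32Ty());
  Value *Slot = B.CreateInBoundsGEP(B.getInt8Ty(), SwBuf, Off);
  // Only now cast to the flat address space, so the flat pointer aliases
  // instrumentable global memory rather than raw LDS.
  return B.CreateAddrSpaceCast(Slot, B.getPtrTy(/*AS=*/0));
}
```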
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll
index ed9107764eb91..7184ebbb8faa3 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-nested.ll
@@ -1,4 +1,4 @@
-; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5
 ; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s
 
 ; Test to check if LDS accesses are lowered correctly when a call is made to nested non-kernel.
@@ -6,18 +6,32 @@
 @A = external addrspace(3) global [8 x ptr]
 @B = external addrspace(3) global [0 x i32]
 
+;.
+; CHECK: @llvm.amdgcn.sw.lds.kernel_2 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel_2.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_2.md.type { %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_2.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.kernel_1 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]]
+; CHECK: @llvm.amdgcn.kernel_1.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel_1.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_1.md.type { %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_1.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.kernel_3 = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0]]
+; CHECK: @llvm.amdgcn.kernel_3.dynlds = external addrspace(3) global [0 x i8], no_sanitize_address, align 4, !absolute_symbol [[META1]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel_3.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_3.md.type { %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_3.md.item { i32 32, i32 0, i32 32 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.kernel_0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 8, !absolute_symbol [[META0]]
+; CHECK: @llvm.amdgcn.sw.lds.kernel_0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.kernel_0.md.type { %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.kernel_0.md.item { i32 32, i32 64, i32 96 } }, no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.base.table = internal addrspace(1) constant [4 x ptr addrspace(3)] [ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3], no_sanitize_address
+; CHECK: @llvm.amdgcn.sw.lds.offset.table = internal addrspace(1) constant [4 x [2 x ptr addrspace(1)]] [[2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_0.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_1.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 1,
i32 0)], [2 x ptr addrspace(1)] [ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_2.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), ptr addrspace(1) poison], [2 x ptr addrspace(1)] [ptr addrspace(1) poison, ptr addrspace(1) getelementptr inbounds (%llvm.amdgcn.sw.lds.kernel_3.md.type, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 1, i32 0)]], no_sanitize_address +;. define amdgpu_kernel void @kernel_0() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_0( -; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 0), align 4 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_0.md, i32 0, i32 1, i32 2), align 4 ; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]] @@ -33,23 +47,23 @@ define amdgpu_kernel void @kernel_0() sanitize_address { ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96 ; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 18: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_0, align 8 ; CHECK-NEXT: call void @call_store_A() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64 ; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @call_store_A() @@ -58,16 +72,16 @@ define amdgpu_kernel void @kernel_0() sanitize_address { define amdgpu_kernel void @kernel_1() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_1( -; CHECK-SAME: ) #[[ATTR1:[0-9]+]] 
!llvm.amdgcn.lds.kernel.id [[META3:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB23:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, align 4 ; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_1_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_1.md, i32 0, i32 0, i32 2), align 4 ; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] @@ -90,24 +104,24 @@ define amdgpu_kernel void @kernel_1() sanitize_address { ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 ; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24) -; CHECK-NEXT: br label [[TMP14]] -; CHECK: 23: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB23]] +; CHECK: [[BB23]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_1, align 8 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_1.dynlds) ] ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64 ; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; %ptr = call ptr @get_B_ptr() @@ -116,16 +130,16 @@ define amdgpu_kernel void @kernel_1() sanitize_address { define amdgpu_kernel void @kernel_2() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_2( -; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META4:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label 
[[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 0), align 4 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_2_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_2.md, i32 0, i32 1, i32 2), align 4 ; CHECK-NEXT: [[TMP12:%.*]] = add i32 [[TMP9]], [[TMP10]] @@ -141,23 +155,23 @@ define amdgpu_kernel void @kernel_2() sanitize_address { ; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 96 ; CHECK-NEXT: [[TMP23:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP23]], i64 32) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 18: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP15:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_2, align 8 ; CHECK-NEXT: call void @store_A() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP16:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr [[TMP16]] to i64 ; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(1) [[TMP15]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP18]], i64 [[TMP17]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @store_A() @@ -166,16 +180,16 @@ define amdgpu_kernel void @kernel_2() sanitize_address { define amdgpu_kernel void @kernel_3() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @kernel_3( -; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META5:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1]] !llvm.amdgcn.lds.kernel.id [[META6:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP14:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB23:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, align 4 ; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_KERNEL_3_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.kernel_3.md, i32 0, i32 0, i32 2), align 4 ; CHECK-NEXT: [[TMP21:%.*]] = add i32 [[TMP12]], [[TMP20]] @@ -198,24 +212,24 @@ define amdgpu_kernel void @kernel_3() sanitize_address { ; CHECK-NEXT: 
[[TMP26:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 ; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP26]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP27]], i64 24) -; CHECK-NEXT: br label [[TMP14]] -; CHECK: 23: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB23]] +; CHECK: [[BB23]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP22:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.kernel_3, align 8 ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel_3.dynlds) ] ; CHECK-NEXT: [[PTR:%.*]] = call ptr @get_B_ptr() -; CHECK-NEXT: br label [[CONDFREE:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP23:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP24:%.*]] = ptrtoint ptr [[TMP23]] to i64 ; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint ptr addrspace(1) [[TMP22]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP25]], i64 [[TMP24]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; %ptr = call ptr @get_B_ptr() @@ -243,7 +257,9 @@ define private void @store_A() sanitize_address { ; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP12]] to ptr ; CHECK-NEXT: store ptr [[TMP10]], ptr null, align 8 ; CHECK-NEXT: ret void ; @@ -262,7 +278,9 @@ define private ptr @get_B_ptr() sanitize_address { ; CHECK-NEXT: [[TMP5:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP6]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr addrspace(1) [[TMP5]], align 4 ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP8]] -; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP11]] +; CHECK-NEXT: [[TMP10:%.*]] = addrspacecast ptr addrspace(1) [[TMP12]] to ptr ; CHECK-NEXT: ret ptr [[TMP10]] ; ret ptr addrspacecast (ptr addrspace(3) @B to ptr) @@ -272,8 +290,6 @@ define private ptr @get_B_ptr() sanitize_address { !0 = !{i32 4, !"nosanitize_address", i32 1} ;. -; CHECK: [[META2]] = !{i32 0} -; CHECK: [[META3]] = !{i32 1} -; CHECK: [[META4]] = !{i32 2} -; CHECK: [[META5]] = !{i32 3} +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" } +; CHECK: attributes #[[ATTR1]] = { sanitize_address "amdgpu-lds-size"="8,8" } ;. 
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll index b9fa89dd6f0a6..704bc9e635294 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access-no-kernel-lds-id.ll @@ -29,8 +29,12 @@ define void @use_variables() sanitize_address { ; CHECK-NEXT: [[TMP10:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP9]], align 8 ; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr addrspace(1) [[TMP10]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP3]], i32 [[TMP11]] -; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr -; CHECK-NEXT: [[TMP13:%.*]] = addrspacecast ptr addrspace(3) [[TMP8]] to ptr +; CHECK-NEXT: [[TMP18:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP18]] +; CHECK-NEXT: [[TMP20:%.*]] = addrspacecast ptr addrspace(1) [[TMP19]] to ptr +; CHECK-NEXT: [[TMP16:%.*]] = ptrtoint ptr addrspace(3) [[TMP8]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP16]] +; CHECK-NEXT: [[TMP13:%.*]] = addrspacecast ptr addrspace(1) [[TMP17]] to ptr ; CHECK-NEXT: store i8 3, ptr [[TMP13]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP12]] to i32 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP4]], i32 [[TMP14]] diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll index 11e912287c7f7..8f5abe962f8eb 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-indirect-access.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 4 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 ; RUN: opt < %s -passes=amdgpu-sw-lower-lds -amdgpu-asan-instrument-lds=false -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s ; Test to check if static LDS is lowered correctly when a non-kernel with LDS accesses is called from kernel. 
@@ -28,8 +28,12 @@ define void @use_variables() sanitize_address { ; CHECK-NEXT: [[TMP12:%.*]] = load ptr addrspace(1), ptr addrspace(1) [[TMP11]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) [[TMP12]], align 4 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr addrspace(3) [[TMP4]], i32 [[TMP10]] -; CHECK-NEXT: [[X:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr -; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(3) [[TMP9]] to ptr +; CHECK-NEXT: [[TMP13:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP13]] +; CHECK-NEXT: [[TMP19:%.*]] = addrspacecast ptr addrspace(1) [[TMP18]] to ptr +; CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr addrspace(3) [[TMP9]] to i32 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP20]] +; CHECK-NEXT: [[TMP16:%.*]] = addrspacecast ptr addrspace(1) [[TMP17]] to ptr ; CHECK-NEXT: store i8 3, ptr [[TMP16]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = ptrtoint ptr addrspace(3) [[TMP15]] to i32 ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP7]], i32 [[TMP14]] @@ -44,16 +48,16 @@ define void @use_variables() sanitize_address { define amdgpu_kernel void @k0() sanitize_address { ; CHECK-LABEL: define amdgpu_kernel void @k0( -; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META1:![0-9]+]] { -; CHECK-NEXT: WId: +; CHECK-SAME: ) #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META2:![0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: ; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() ; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() ; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] ; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 -; CHECK-NEXT: br i1 [[TMP5]], label [[MALLOC:%.*]], label [[TMP7:%.*]] -; CHECK: Malloc: +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB24:.*]] +; CHECK: [[MALLOC]]: ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 0), align 4 ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 4, i32 2), align 4 ; CHECK-NEXT: [[TMP16:%.*]] = add i32 [[TMP13]], [[TMP14]] @@ -78,9 +82,9 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK-NEXT: [[TMP67:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP6]], i64 132 ; CHECK-NEXT: [[TMP68:%.*]] = ptrtoint ptr addrspace(1) [[TMP67]] to i64 ; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP68]], i64 28) -; CHECK-NEXT: br label [[TMP7]] -; CHECK: 24: -; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, [[WID:%.*]] ], [ true, [[MALLOC]] ] +; CHECK-NEXT: br label %[[BB24]] +; CHECK: [[BB24]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() ; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 ; CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 @@ -94,17 +98,17 @@ define amdgpu_kernel void @k0() sanitize_address { 
; CHECK-NEXT: [[TMP47:%.*]] = ptrtoint ptr addrspace(3) [[TMP18]] to i32 ; CHECK-NEXT: [[TMP48:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP47]] ; CHECK-NEXT: store i32 8, ptr addrspace(1) [[TMP48]], align 2 -; CHECK-NEXT: br label [[CONDFREE1:%.*]] -; CHECK: CondFree: +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: ; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() -; CHECK-NEXT: br i1 [[XYZCOND]], label [[FREE:%.*]], label [[END:%.*]] -; CHECK: Free: +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: ; CHECK-NEXT: [[TMP20:%.*]] = call ptr @llvm.returnaddress(i32 0) ; CHECK-NEXT: [[TMP21:%.*]] = ptrtoint ptr [[TMP20]] to i64 ; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 ; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP22]], i64 [[TMP21]]) -; CHECK-NEXT: br label [[END]] -; CHECK: End: +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: ; CHECK-NEXT: ret void ; call void @use_variables() @@ -124,5 +128,6 @@ define amdgpu_kernel void @k0() sanitize_address { ; CHECK: attributes #[[ATTR4:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } ;. ; CHECK: [[META0]] = !{i32 0, i32 1} -; CHECK: [[META1]] = !{i32 0} +; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1} +; CHECK: [[META2]] = !{i32 0} ;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-O0.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-O0.ll new file mode 100644 index 0000000000000..1973a0acf4659 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-O0.ll @@ -0,0 +1,76 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -mtriple=amdgcn-amd-amdhsa | FileCheck %s +@lds = internal addrspace(3) global [5 x i32] poison, align 16 + +;. +; CHECK: @llvm.amdgcn.sw.lds.k0 = internal addrspace(3) global ptr poison, no_sanitize_address, align 16, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @llvm.amdgcn.sw.lds.k0.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.k0.md.type { %llvm.amdgcn.sw.lds.k0.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.k0.md.item { i32 32, i32 20, i32 64 } }, no_sanitize_address +;. 
+define amdgpu_kernel void @k0() sanitize_address { +; CHECK-LABEL: define amdgpu_kernel void @k0( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[BB18:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 52 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 44) +; CHECK-NEXT: br label %[[BB18]] +; CHECK: [[BB18]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, align 8 +; CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_K0_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.k0.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.k0, i32 [[TMP20]] +; CHECK-NEXT: [[TMP22:%.*]] = ptrtoint ptr addrspace(3) [[TMP21]] to i32 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP19]], i32 [[TMP22]] +; CHECK-NEXT: [[TMP24:%.*]] = addrspacecast ptr addrspace(1) [[TMP23]] to ptr +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [5 x i32], ptr [[TMP24]], i64 0, i64 0 +; CHECK-NEXT: store i32 1, ptr [[GEP]], align 4 +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP25:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP26:%.*]] = ptrtoint ptr [[TMP25]] to i64 +; CHECK-NEXT: [[TMP27:%.*]] = ptrtoint ptr addrspace(1) [[TMP19]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP27]], i64 [[TMP26]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: ret void +; + %gep = 
getelementptr inbounds [5 x i32], ptr addrspacecast (ptr addrspace(3) @lds to ptr), i64 0, i64 0 + store i32 1, ptr %gep, align 4 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"nosanitize_address", i32 1} +;. +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="16" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +;. +; CHECK: [[META0]] = !{i32 0, i32 1} +; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1} +;. diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-vector-ptrs.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-vector-ptrs.ll new file mode 100644 index 0000000000000..34caf91def933 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-sw-lower-lds-static-lds-vector-ptrs.ll @@ -0,0 +1,95 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt < %s -passes=amdgpu-sw-lower-lds -S -amdgpu-asan-instrument-lds=false -mtriple=amdgcn-amd-amdhsa | FileCheck %s + +; Test to check if accesses through a vector of static LDS pointers in a kernel are lowered correctly. +@lds_var1 = internal addrspace(3) global i32 poison +@lds_var2 = internal addrspace(3) global i32 poison + +;. +; CHECK: @llvm.amdgcn.sw.lds.example = internal addrspace(3) global ptr poison, no_sanitize_address, align 4, !absolute_symbol [[META0:![0-9]+]] +; CHECK: @llvm.amdgcn.sw.lds.example.md = internal addrspace(1) global %llvm.amdgcn.sw.lds.example.md.type { %llvm.amdgcn.sw.lds.example.md.item { i32 0, i32 8, i32 32 }, %llvm.amdgcn.sw.lds.example.md.item { i32 32, i32 4, i32 32 }, %llvm.amdgcn.sw.lds.example.md.item { i32 64, i32 4, i32 32 } }, no_sanitize_address +;.
+define amdgpu_kernel void @example() sanitize_address { +; CHECK-LABEL: define amdgpu_kernel void @example( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[WID:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = call i32 @llvm.amdgcn.workitem.id.x() +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.workitem.id.y() +; CHECK-NEXT: [[TMP2:%.*]] = call i32 @llvm.amdgcn.workitem.id.z() +; CHECK-NEXT: [[TMP3:%.*]] = or i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = or i32 [[TMP3]], [[TMP2]] +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[TMP4]], 0 +; CHECK-NEXT: br i1 [[TMP5]], label %[[MALLOC:.*]], label %[[ENTRY:.*]] +; CHECK: [[MALLOC]]: +; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE:%.*]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 2, i32 0), align 4 +; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 2, i32 2), align 4 +; CHECK-NEXT: [[TMP8:%.*]] = add i32 [[TMP6]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = zext i32 [[TMP8]] to i64 +; CHECK-NEXT: [[TMP10:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP11:%.*]] = ptrtoint ptr [[TMP10]] to i64 +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @__asan_malloc_impl(i64 [[TMP9]], i64 [[TMP11]]) +; CHECK-NEXT: [[TMP13:%.*]] = inttoptr i64 [[TMP12]] to ptr addrspace(1) +; CHECK-NEXT: store ptr addrspace(1) [[TMP13]], ptr addrspace(3) @llvm.amdgcn.sw.lds.example, align 8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 8 +; CHECK-NEXT: [[TMP15:%.*]] = ptrtoint ptr addrspace(1) [[TMP14]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP15]], i64 24) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 36 +; CHECK-NEXT: [[TMP17:%.*]] = ptrtoint ptr addrspace(1) [[TMP16]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP17]], i64 28) +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP13]], i64 68 +; CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr addrspace(1) [[TMP18]] to i64 +; CHECK-NEXT: call void @__asan_poison_region(i64 [[TMP19]], i64 28) +; CHECK-NEXT: br label %[[ENTRY]] +; CHECK: [[ENTRY]]: +; CHECK-NEXT: [[XYZCOND:%.*]] = phi i1 [ false, %[[WID]] ], [ true, %[[MALLOC]] ] +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: [[TMP20:%.*]] = load ptr addrspace(1), ptr addrspace(3) @llvm.amdgcn.sw.lds.example, align 8 +; CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 1, i32 0), align 4 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.example, i32 [[TMP21]] +; CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr addrspace(1) getelementptr inbounds ([[LLVM_AMDGCN_SW_LDS_EXAMPLE_MD_TYPE]], ptr addrspace(1) @llvm.amdgcn.sw.lds.example.md, i32 0, i32 2, i32 0), align 4 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr addrspace(3) @llvm.amdgcn.sw.lds.example, i32 [[TMP23]] +; CHECK-NEXT: [[VEC_LDS_PTRS:%.*]] = insertelement <2 x ptr addrspace(3)> poison, ptr addrspace(3) [[TMP22]], i32 0 +; CHECK-NEXT: [[VEC_LDS_PTRS1:%.*]] = insertelement <2 x ptr addrspace(3)> [[VEC_LDS_PTRS]], ptr addrspace(3) [[TMP24]], i32 1 +; CHECK-NEXT: [[TMP25:%.*]] = ptrtoint <2 x ptr addrspace(3)> [[VEC_LDS_PTRS1]] to <2 x i32> +; CHECK-NEXT: 
[[TMP31:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP20]], <2 x i32> [[TMP25]] +; CHECK-NEXT: [[TMP32:%.*]] = addrspacecast <2 x ptr addrspace(1)> [[TMP31]] to <2 x ptr> +; CHECK-NEXT: [[ELEM0:%.*]] = extractelement <2 x ptr> [[TMP32]], i32 0 +; CHECK-NEXT: store i32 42, ptr [[ELEM0]], align 4 +; CHECK-NEXT: [[ELEM1:%.*]] = extractelement <2 x ptr> [[TMP32]], i32 1 +; CHECK-NEXT: store i32 43, ptr [[ELEM1]], align 4 +; CHECK-NEXT: br label %[[CONDFREE:.*]] +; CHECK: [[CONDFREE]]: +; CHECK-NEXT: call void @llvm.amdgcn.s.barrier() +; CHECK-NEXT: br i1 [[XYZCOND]], label %[[FREE:.*]], label %[[END:.*]] +; CHECK: [[FREE]]: +; CHECK-NEXT: [[TMP33:%.*]] = call ptr @llvm.returnaddress(i32 0) +; CHECK-NEXT: [[TMP34:%.*]] = ptrtoint ptr [[TMP33]] to i64 +; CHECK-NEXT: [[TMP35:%.*]] = ptrtoint ptr addrspace(1) [[TMP20]] to i64 +; CHECK-NEXT: call void @__asan_free_impl(i64 [[TMP35]], i64 [[TMP34]]) +; CHECK-NEXT: br label %[[END]] +; CHECK: [[END]]: +; CHECK-NEXT: ret void +; +entry: + ; Create a vector of flat pointers + %vec_lds_ptrs = insertelement <2 x ptr addrspace(3)> poison, ptr addrspace(3) @lds_var1, i32 0 + %vec_lds_ptrs1 = insertelement <2 x ptr addrspace(3)> %vec_lds_ptrs, ptr addrspace(3) @lds_var2, i32 1 + %vec_flat_ptrs = addrspacecast <2 x ptr addrspace(3)> %vec_lds_ptrs1 to <2 x ptr> + %elem0 = extractelement <2 x ptr> %vec_flat_ptrs, i32 0 + store i32 42, ptr %elem0, align 4 + %elem1 = extractelement <2 x ptr> %vec_flat_ptrs, i32 1 + store i32 43, ptr %elem1, align 4 + ret void +} + +!llvm.module.flags = !{!0} +!0 = !{i32 4, !"nosanitize_address", i32 1} +;. +; CHECK: attributes #[[ATTR0]] = { sanitize_address "amdgpu-lds-size"="8" } +; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { convergent nocallback nofree nounwind willreturn } +;. +; CHECK: [[META0]] = !{i32 0, i32 1} +; CHECK: [[META1:![0-9]+]] = !{i32 4, !"nosanitize_address", i32 1} +;. 
diff --git a/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir b/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir index 08693ec9db1d4..2492eb2982aac 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-operands-scalar-fmac.mir @@ -133,7 +133,8 @@ body: | ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1234567890, 1056964608, implicit $mode + ; CHECK-NEXT: %noninlinable:sreg_32 = S_MOV_B32 1234567890 + ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAAK_F32 %noninlinable, [[COPY]], 1056964608, implicit $mode ; CHECK-NEXT: $sgpr0 = COPY %fma %0:sreg_32 = COPY $sgpr0 %inlinable:sreg_32 = S_MOV_B32 1056964608 @@ -152,7 +153,8 @@ body: | ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1234567890, 1056964608, implicit $mode + ; CHECK-NEXT: %noninlinable:sreg_32 = S_MOV_B32 1234567890 + ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAAK_F32 [[COPY]], %noninlinable, 1056964608, implicit $mode ; CHECK-NEXT: $sgpr0 = COPY %fma %0:sreg_32 = COPY $sgpr0 %inlinable:sreg_32 = S_MOV_B32 1056964608 @@ -210,7 +212,8 @@ body: | ; CHECK: liveins: $sgpr0 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAAK_F32 [[COPY]], 1056964608, 1234567890, implicit $mode + ; CHECK-NEXT: %noninlinable:sreg_32 = S_MOV_B32 1234567890 + ; CHECK-NEXT: %fma:sreg_32 = nofpexcept S_FMAMK_F32 [[COPY]], 1056964608, %noninlinable, implicit $mode ; CHECK-NEXT: $sgpr0 = COPY %fma %0:sreg_32 = COPY $sgpr0 %inlinable:sreg_32 = S_MOV_B32 1056964608 diff --git a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir index 5f985605c082d..c8afb89aa272a 100644 --- a/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-sgpr-multi-imm.mir @@ -69,3 +69,202 @@ body: | %0:sreg_32 = S_MOV_B32 63 %1:sreg_32 = S_ADD_I32 %stack.0, %0, implicit-def $scc ... + +# GCN-LABEL: name: test_no_fold_literal_already_inline_lhs{{$}} +# GCN: %0:sreg_32 = S_MOV_B32 80 +# GCN-NEXT: %1:sreg_32 = S_ADD_I32 70, %0 +--- +name: test_no_fold_literal_already_inline_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + %1:sreg_32 = S_ADD_I32 70, %0, implicit-def $scc +... + +# GCN-LABEL: name: test_no_fold_literal_already_inline_rhs{{$}} +# GCN: %0:sreg_32 = S_MOV_B32 80 +# GCN-NEXT: %1:sreg_32 = S_ADD_I32 %0, 70 +--- +name: test_no_fold_literal_already_inline_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + %1:sreg_32 = S_ADD_I32 %0, 70, implicit-def $scc +... + +# GCN-LABEL: name: test_fold_literal_inlineimm_lhs{{$}} +# GCN: %1:sreg_32 = S_ADD_I32 64, 80 +--- +name: test_fold_literal_inlineimm_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + %1:sreg_32 = S_ADD_I32 64, %0, implicit-def $scc +... + +# GCN-LABEL: name: test_fold_literal_inlineimm_rhs{{$}} +# GCN: %1:sreg_32 = S_ADD_I32 80, 64 +--- +name: test_fold_literal_inlineimm_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + %1:sreg_32 = S_ADD_I32 %0, 64, implicit-def $scc +... 
+ +# GCN-LABEL: name: test_fold_same_literal_2x{{$}} +# GCN: %2:sreg_32 = S_ADD_I32 70, %1 +--- +name: test_fold_same_literal_2x +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_MOV_B32 70 + %2:sreg_32 = S_ADD_I32 %0, %1, implicit-def $scc +... + +# GCN-LABEL: name: test_fold_same_literal_lhs{{$}} +# GCN: %1:sreg_32 = S_ADD_I32 70, %0 +--- +name: test_fold_same_literal_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_ADD_I32 70, %0, implicit-def $scc +... + +# GCN-LABEL: name: test_fold_same_literal_rhs{{$}} +# GCN: %1:sreg_32 = S_ADD_I32 %0, 70 +--- +name: test_fold_same_literal_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_ADD_I32 %0, 70, implicit-def $scc +... + + +# GCN-LABEL: name: test_s_cselect_b32_2x_literal_fold{{$}} +# GCN: %2:sreg_32 = S_CSELECT_B32 70, %1, implicit $scc +--- +name: test_s_cselect_b32_2x_literal_fold +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_MOV_B32 80 + $scc = IMPLICIT_DEF + %2:sreg_32 = S_CSELECT_B32 %0, %1, implicit $scc +... + +# GCN-LABEL: name: test_s_cselect_b32_fold_literal_literal_lhs{{$}} +# GCN: %1:sreg_32 = S_CSELECT_B32 70, %0, implicit $scc +--- +name: test_s_cselect_b32_fold_literal_literal_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + $scc = IMPLICIT_DEF + %1:sreg_32 = S_CSELECT_B32 70, %0, implicit $scc +... + +# GCN-LABEL: name: test_s_cselect_b32_fold_literal_literal_rhs{{$}} +# GCN: %1:sreg_32 = S_CSELECT_B32 %0, 70, implicit $scc +--- +name: test_s_cselect_b32_fold_literal_literal_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + $scc = IMPLICIT_DEF + %1:sreg_32 = S_CSELECT_B32 %0, 70, implicit $scc +... + +# GCN-LABEL: name: test_s_cselect_b32_fold_literal_inlineimm_lhs{{$}} +# GCN: %1:sreg_32 = S_CSELECT_B32 64, 80, implicit $scc +--- +name: test_s_cselect_b32_fold_literal_inlineimm_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + $scc = IMPLICIT_DEF + %1:sreg_32 = S_CSELECT_B32 64, %0, implicit $scc +... + +# GCN-LABEL: name: test_s_cselect_b32_fold_literal_inlineimm_rhs{{$}} +# GCN: %1:sreg_32 = S_CSELECT_B32 80, 64, implicit $scc +--- +name: test_s_cselect_b32_fold_literal_inlineimm_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + $scc = IMPLICIT_DEF + %1:sreg_32 = S_CSELECT_B32 %0, 64, implicit $scc +... + +# GCN-LABEL: name: test_s_cmp_b32_2x_literal_fold{{$}} +# GCN: S_CMP_EQ_U32 70, %1, implicit-def $scc +--- +name: test_s_cmp_b32_2x_literal_fold +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 70 + %1:sreg_32 = S_MOV_B32 80 + $scc = IMPLICIT_DEF + S_CMP_EQ_U32 %0, %1, implicit-def $scc +... + +# GCN-LABEL: name: test_s_cmp_b32_literal_literal_lhs{{$}} +# GCN: S_CMP_EQ_U32 70, %0, implicit-def $scc +--- +name: test_s_cmp_b32_literal_literal_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + S_CMP_EQ_U32 70, %0, implicit-def $scc +... + +# GCN-LABEL: name: test_s_cmp_b32_literal_literal_rhs{{$}} +# GCN: S_CMP_EQ_U32 %0, 70, implicit-def $scc +--- +name: test_s_cmp_b32_literal_literal_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + S_CMP_EQ_U32 %0, 70, implicit-def $scc +... 
+ +# GCN-LABEL: name: test_s_cmp_b32_literal_inlineimm_lhs{{$}} +# GCN: S_CMP_EQ_U32 64, 80, implicit-def $scc +--- +name: test_s_cmp_b32_literal_inlineimm_lhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + S_CMP_EQ_U32 64, %0, implicit-def $scc +... + +# GCN-LABEL: name: test_s_cmp_b32_literal_inlineimm_rhs{{$}} +# GCN: S_CMP_EQ_U32 80, 64, implicit-def $scc +--- +name: test_s_cmp_b32_literal_inlineimm_rhs +tracksRegLiveness: true +body: | + bb.0: + %0:sreg_32 = S_MOV_B32 80 + S_CMP_EQ_U32 %0, 64, implicit-def $scc +... diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-remat-v_pk_mov_b32.mir b/llvm/test/CodeGen/AMDGPU/vgpr-remat-v_pk_mov_b32.mir new file mode 100644 index 0000000000000..9af18758e2206 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-remat-v_pk_mov_b32.mir @@ -0,0 +1,49 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -run-pass=register-coalescer -o - %s | FileCheck %s + +--- +name: test_remat_v_pk_mov_b32 +tracksRegLiveness: true +body: | + ; CHECK-LABEL: name: test_remat_v_pk_mov_b32 + ; CHECK: bb.0: + ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) + ; CHECK-NEXT: liveins: $sgpr0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_PK_MOV_B32_:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 8, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_PK_MOV_B32_1:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 8, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[V_PK_MOV_B32_2:%[0-9]+]]:vreg_64_align2 = V_PK_MOV_B32 8, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr0 + ; CHECK-NEXT: $exec = S_MOV_B64_term [[COPY]] + ; CHECK-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec + ; CHECK-NEXT: S_BRANCH %bb.1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1: + ; CHECK-NEXT: successors: %bb.2(0x80000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[V_PK_MOV_B32_1:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_MOV_B32_1]], 8, [[V_PK_MOV_B32_1]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: [[V_PK_MOV_B32_2:%[0-9]+]]:vreg_64_align2 = V_PK_ADD_F32 8, [[V_PK_MOV_B32_2]], 8, [[V_PK_MOV_B32_2]], 11, 0, 0, 0, 0, implicit $mode, implicit $exec + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2: + ; CHECK-NEXT: S_NOP 0, implicit [[V_PK_MOV_B32_1]] + ; CHECK-NEXT: S_NOP 0, implicit [[V_PK_MOV_B32_2]] + ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_PK_MOV_B32_]] + bb.0: + liveins: $sgpr0 + %0:vreg_64_align2 = V_PK_MOV_B32 8, 0, 8, 0, 0, 0, 0, 0, 0, implicit $exec + %1:vreg_64_align2 = COPY %0:vreg_64_align2 + %2:vreg_64_align2 = COPY %0:vreg_64_align2 + %3:sreg_64 = COPY $sgpr0 + $exec = S_MOV_B64_term %3:sreg_64 + S_CBRANCH_EXECZ %bb.2, implicit $exec + S_BRANCH %bb.1 + + bb.1: + %1:vreg_64_align2 = V_PK_ADD_F32 8, %1, 8, %1, 11, 0, 0, 0, 0, implicit $mode, implicit $exec + %2:vreg_64_align2 = V_PK_ADD_F32 8, %2, 8, %2, 11, 0, 0, 0, 0, implicit $mode, implicit $exec + + bb.2: + S_NOP 0, implicit %1 + S_NOP 0, implicit %2 + S_ENDPGM 0, implicit %0 +... 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index 585a331e55094..bef29dfecef4c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -1318,11 +1318,10 @@ define void @sqrt_v6bf16(ptr %x) { ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; CHECK-NEXT: vle16.v v8, (a0) -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 ; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; CHECK-NEXT: vfsqrt.v v8, v10 -; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 ; CHECK-NEXT: vse16.v v10, (a0) ; CHECK-NEXT: ret @@ -1371,11 +1370,10 @@ define void @sqrt_v6f16(ptr %x) { ; ZVFHMIN: # %bb.0: ; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFHMIN-NEXT: vle16.v v8, (a0) -; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 ; ZVFHMIN-NEXT: vsetvli zero, zero, e32, m2, ta, ma ; ZVFHMIN-NEXT: vfsqrt.v v8, v10 -; ZVFHMIN-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; ZVFHMIN-NEXT: vsetvli zero, zero, e16, m1, ta, ma ; ZVFHMIN-NEXT: vfncvt.f.f.w v10, v8 ; ZVFHMIN-NEXT: vse16.v v10, (a0) ; ZVFHMIN-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll index c6ee9e34dc207..5cd9b77af82cf 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll @@ -5069,3 +5069,51 @@ define <vscale x 4 x float> @vfwmaccbf16_vf(<vscale x 4 x float> %a, bfloat %b, %2 = call <vscale x 4 x float> @llvm.riscv.vfadd(<vscale x 4 x float> poison, <vscale x 4 x float> %1, <vscale x 4 x float> %d, iXLen 7, iXLen %vl) ret <vscale x 4 x float> %2 } + +define <vscale x 4 x double> @vfsqrt(<vscale x 4 x float> %a) { +; NOVLOPT-LABEL: vfsqrt: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: fsrmi a0, 0 +; NOVLOPT-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; NOVLOPT-NEXT: vfsqrt.v v10, v8 +; NOVLOPT-NEXT: fsrm a0 +; NOVLOPT-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; NOVLOPT-NEXT: vfwmacc.vv v12, v8, v10 +; NOVLOPT-NEXT: vmv4r.v v8, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vfsqrt: +; VLOPT: # %bb.0: +; VLOPT-NEXT: fsrmi a0, 0 +; VLOPT-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; VLOPT-NEXT: vfsqrt.v v10, v8 +; VLOPT-NEXT: fsrm a0 +; VLOPT-NEXT: vfwmacc.vv v12, v8, v10 +; VLOPT-NEXT: vmv4r.v v8, v12 +; VLOPT-NEXT: ret + %1 = call <vscale x 4 x float> @llvm.riscv.vfsqrt.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, iXLen 0, iXLen 7) + %2 = call <vscale x 4 x double> @llvm.riscv.vfwmacc(<vscale x 4 x double> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %1, iXLen 7, iXLen 6, iXLen 0) + ret <vscale x 4 x double> %2 +} + +define <vscale x 4 x double> @vfrsqrt7(<vscale x 4 x float> %a) { +; NOVLOPT-LABEL: vfrsqrt7: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetivli zero, 7, e32, m2, ta, ma +; NOVLOPT-NEXT: vfrsqrt7.v v10, v8 +; NOVLOPT-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; NOVLOPT-NEXT: vfwmacc.vv v12, v8, v10 +; NOVLOPT-NEXT: vmv4r.v v8, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vfrsqrt7: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; VLOPT-NEXT: vfrsqrt7.v v10, v8 +; VLOPT-NEXT: vfwmacc.vv v12, v8, v10 +; VLOPT-NEXT: vmv4r.v v8, v12 +; VLOPT-NEXT: ret + %1 = call <vscale x 4 x float> @llvm.riscv.vfrsqrt7.nxv4f32(<vscale x 4 x float> poison, <vscale x 4 x float> %a, iXLen 7) + %2 = call <vscale x 4 x double> @llvm.riscv.vfwmacc(<vscale x 4 x double> poison, <vscale x 4 x float> %a, <vscale x 4 x float> %1, iXLen 7, iXLen 6, iXLen 0) + ret <vscale x 4 x double> %2 +} diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir index 0475a988e9851..cb43a89ea3bc6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.mir @@ -141,6 +141,46 @@ body: | %y:vr = PseudoVADD_VV_M1 $noreg, %x, $noreg, 1, 4 /* e16 */, 0 ...
--- +name: vfsqrt_nofpexcept +body: | + bb.0: + ; CHECK-LABEL: name: vfsqrt_nofpexcept + ; CHECK: %x:vrm2 = nofpexcept PseudoVFSQRT_V_M2_E32 $noreg, $noreg, 7, 6, 5 /* e32 */, 3 /* ta, ma */, implicit $frm + ; CHECK-NEXT: early-clobber %y:vr = nofpexcept PseudoVFNCVTBF16_F_F_W_M1_E16 $noreg, %x, 7, 6, 4 /* e16 */, 3 /* ta, ma */, implicit $frm + %x:vrm2 = nofpexcept PseudoVFSQRT_V_M2_E32 $noreg, $noreg, 7, 8, 5, 3, implicit $frm + early-clobber %y:vr = nofpexcept PseudoVFNCVTBF16_F_F_W_M1_E16 $noreg, %x, 7, 6, 4, 3, implicit $frm +... +--- +name: vfsqrt_fpexcept +body: | + bb.0: + ; CHECK-LABEL: name: vfsqrt_fpexcept + ; CHECK: %x:vrm2 = PseudoVFSQRT_V_M2_E32 $noreg, $noreg, 7, 8, 5 /* e32 */, 3 /* ta, ma */, implicit $frm + ; CHECK-NEXT: early-clobber %y:vr = nofpexcept PseudoVFNCVTBF16_F_F_W_M1_E16 $noreg, %x, 7, 6, 4 /* e16 */, 3 /* ta, ma */, implicit $frm + %x:vrm2 = PseudoVFSQRT_V_M2_E32 $noreg, $noreg, 7, 8, 5, 3, implicit $frm + early-clobber %y:vr = nofpexcept PseudoVFNCVTBF16_F_F_W_M1_E16 $noreg, %x, 7, 6, 4, 3, implicit $frm +... +--- +name: vfrsqrt7_nofpexcept +body: | + bb.0: + ; CHECK-LABEL: name: vfrsqrt7_nofpexcept + ; CHECK: %x:vrm2 = nofpexcept PseudoVFRSQRT7_V_M2_E32 $noreg, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vrm2 = PseudoVADD_VV_M2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:vrm2 = nofpexcept PseudoVFRSQRT7_V_M2_E32 $noreg, $noreg, 7, 5, 0 + %y:vrm2 = PseudoVADD_VV_M2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 +... +--- +name: vfrsqrt7_fpexcept +body: | + bb.0: + ; CHECK-LABEL: name: vfrsqrt7_fpexcept + ; CHECK: %x:vrm2 = PseudoVFRSQRT7_V_M2_E32 $noreg, $noreg, 7, 5 /* e32 */, 0 /* tu, mu */ + ; CHECK-NEXT: %y:vrm2 = PseudoVADD_VV_M2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 /* tu, mu */ + %x:vrm2 = PseudoVFRSQRT7_V_M2_E32 $noreg, $noreg, 7, 5, 0 + %y:vrm2 = PseudoVADD_VV_M2 $noreg, %x, $noreg, 1, 5 /* e32 */, 0 +... 
+--- name: vwadd_tied_vs1 body: | bb.0: diff --git a/llvm/test/CodeGen/X86/bfloat-calling-conv.ll b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll new file mode 100644 index 0000000000000..ea4d32bae9ccb --- /dev/null +++ b/llvm/test/CodeGen/X86/bfloat-calling-conv.ll @@ -0,0 +1,1162 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -fast-isel=false -mtriple=x86_64-linux-unknown -mattr=+sse2 < %s | FileCheck -check-prefixes=SSE2 %s +; RUN: llc -fast-isel -mtriple=x86_64-linux-unknown -mattr=+sse2 < %s | FileCheck -check-prefixes=FAST_ISEL_SSE2 %s +; RUN: llc -fast-isel=false -mtriple=x86_64-linux-unknown -mattr=+avx512bf16,avx512vl < %s | FileCheck -check-prefixes=AVX512BF16 %s +; RUN: llc -fast-isel -mtriple=x86_64-linux-unknown -mattr=+avx512bf16,avx512vl < %s | FileCheck -check-prefixes=FAST_ISEL_AVX512BF16 %s +; RUN: llc -fast-isel=false -mtriple=x86_64-linux-unknown -mattr=+avxneconvert < %s | FileCheck -check-prefixes=AVXNECONVERT %s +; RUN: llc -fast-isel -mtriple=x86_64-linux-unknown -mattr=+avxneconvert < %s | FileCheck -check-prefixes=FAST_ISEL_AVXNECONVERT %s + +define bfloat @return_arg_bf16(bfloat %x) #0 { +; SSE2-LABEL: return_arg_bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; FAST_ISEL_SSE2-LABEL: return_arg_bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: pushq %rax +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: popq %rax +; FAST_ISEL_SSE2-NEXT: retq +; +; AVX512BF16-LABEL: return_arg_bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: retq +; +; FAST_ISEL_AVX512BF16-LABEL: return_arg_bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: vpextrw $0, %xmm0, %eax +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax +; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: retq +; +; AVXNECONVERT-LABEL: return_arg_bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: retq +; +; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: vpextrw $0, %xmm0, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: retq + ret bfloat %x +} + +define <2 x bfloat> @return_arg_v2bf16(<2 x bfloat> %x) #0 { +; SSE2-LABEL: return_arg_v2bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; FAST_ISEL_SSE2-LABEL: return_arg_v2bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: subq $40, %rsp +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa 
%xmm1, %xmm0 +; FAST_ISEL_SSE2-NEXT: addq $40, %rsp +; FAST_ISEL_SSE2-NEXT: retq +; +; AVX512BF16-LABEL: return_arg_v2bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: retq +; +; FAST_ISEL_AVX512BF16-LABEL: return_arg_v2bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: retq +; +; AVXNECONVERT-LABEL: return_arg_v2bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: retq +; +; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_v2bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: retq + ret <2 x bfloat> %x +} + +define <3 x bfloat> @return_arg_v3bf16(<3 x bfloat> %x) #0 { +; SSE2-LABEL: return_arg_v3bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; FAST_ISEL_SSE2-LABEL: return_arg_v3bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: subq $40, %rsp +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; FAST_ISEL_SSE2-NEXT: movaps %xmm1, %xmm0 +; FAST_ISEL_SSE2-NEXT: addq $40, %rsp +; FAST_ISEL_SSE2-NEXT: retq +; +; AVX512BF16-LABEL: return_arg_v3bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: retq +; +; FAST_ISEL_AVX512BF16-LABEL: return_arg_v3bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: vpextrw $2, %xmm0, %eax +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax +; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm1 +; FAST_ISEL_AVX512BF16-NEXT: vpextrw $1, %xmm0, %eax +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax +; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm2 +; FAST_ISEL_AVX512BF16-NEXT: vmovd %xmm0, %eax +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax +; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1 +; FAST_ISEL_AVX512BF16-NEXT: vmovd %xmm1, %eax +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm2, %xmm1 +; FAST_ISEL_AVX512BF16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; FAST_ISEL_AVX512BF16-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: retq +; +; AVXNECONVERT-LABEL: return_arg_v3bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: retq +; +; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_v3bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: vpextrw 
$2, %xmm0, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpextrw $1, %xmm0, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm2 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %xmm0, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm1, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %xmm1, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; FAST_ISEL_AVXNECONVERT-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovq %xmm1, %rax +; FAST_ISEL_AVXNECONVERT-NEXT: movl %eax, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: shrl $16, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; FAST_ISEL_AVXNECONVERT-NEXT: shrq $32, %rax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpbroadcastw %xmm1, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] +; FAST_ISEL_AVXNECONVERT-NEXT: retq + ret <3 x bfloat> %x +} + +define <4 x bfloat> @return_arg_v4bf16(<4 x bfloat> %x) #0 { +; SSE2-LABEL: return_arg_v4bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; FAST_ISEL_SSE2-LABEL: return_arg_v4bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: subq $56, %rsp +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; FAST_ISEL_SSE2-NEXT: addq $56, %rsp +; FAST_ISEL_SSE2-NEXT: retq +; +; AVX512BF16-LABEL: return_arg_v4bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: retq +; +; FAST_ISEL_AVX512BF16-LABEL: return_arg_v4bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: retq +; +; AVXNECONVERT-LABEL: return_arg_v4bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: retq +; +; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_v4bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: retq + ret <4 x bfloat> %x +} + +define <8 x bfloat> @return_arg_v8bf16(<8 x bfloat> %x) #0 { +; SSE2-LABEL: return_arg_v8bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; FAST_ISEL_SSE2-LABEL: return_arg_v8bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: pushq %r14 +; FAST_ISEL_SSE2-NEXT: pushq %rbx +; FAST_ISEL_SSE2-NEXT: subq $56, %rsp +; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm1 +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm1 +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; FAST_ISEL_SSE2-NEXT: addq $56, %rsp +; FAST_ISEL_SSE2-NEXT: popq %rbx +; FAST_ISEL_SSE2-NEXT: popq %r14 +; FAST_ISEL_SSE2-NEXT: retq +; +; AVX512BF16-LABEL: return_arg_v8bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: retq +; +; FAST_ISEL_AVX512BF16-LABEL: return_arg_v8bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: retq +; +; AVXNECONVERT-LABEL: return_arg_v8bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: retq +; +; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_v8bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: retq + ret <8 x bfloat> %x +} + +define <16 x bfloat> @return_arg_v16bf16(<16 x bfloat> %x) #0 { +; +; SSE2-LABEL: return_arg_v16bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: retq +; +; FAST_ISEL_SSE2-LABEL: return_arg_v16bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: pushq %r14 +; FAST_ISEL_SSE2-NEXT: pushq %rbx +; FAST_ISEL_SSE2-NEXT: subq $104, %rsp +; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; 
FAST_ISEL_SSE2-NEXT: movd %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm1 +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; 
FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; FAST_ISEL_SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; 
FAST_ISEL_SSE2-NEXT: addq $104, %rsp +; FAST_ISEL_SSE2-NEXT: popq %rbx +; FAST_ISEL_SSE2-NEXT: popq %r14 +; FAST_ISEL_SSE2-NEXT: retq +; +; AVX512BF16-LABEL: return_arg_v16bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: retq +; +; FAST_ISEL_AVX512BF16-LABEL: return_arg_v16bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: retq +; +; AVXNECONVERT-LABEL: return_arg_v16bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: retq +; +; FAST_ISEL_AVXNECONVERT-LABEL: return_arg_v16bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: retq + ret <16 x bfloat> %x +} + +declare bfloat @returns_bf16(bfloat) +declare <2 x bfloat> @returns_v2bf16(<2 x bfloat>) +declare <3 x bfloat> @returns_v3bf16(<3 x bfloat>) +declare <4 x bfloat> @returns_v4bf16(<4 x bfloat>) +declare <8 x bfloat> @returns_v8bf16(<8 x bfloat>) +declare <16 x bfloat> @returns_v16bf16(<16 x bfloat>) + +define bfloat @call_ret_bf16(ptr %ptr) #0 { +; +; SSE2-LABEL: call_ret_bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: pinsrw $0, (%rdi), %xmm0 +; SSE2-NEXT: callq returns_bf16@PLT +; +; FAST_ISEL_SSE2-LABEL: call_ret_bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: pushq %rax +; FAST_ISEL_SSE2-NEXT: movzwl (%rdi), %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: callq returns_bf16@PLT +; +; AVX512BF16-LABEL: call_ret_bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rax +; AVX512BF16-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVX512BF16-NEXT: callq returns_bf16@PLT +; +; FAST_ISEL_AVX512BF16-LABEL: call_ret_bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: pushq %rax +; FAST_ISEL_AVX512BF16-NEXT: movzwl (%rdi), %eax +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax +; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: callq returns_bf16@PLT +; +; AVXNECONVERT-LABEL: call_ret_bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: pushq %rax +; AVXNECONVERT-NEXT: vpinsrw $0, (%rdi), %xmm0, %xmm0 +; AVXNECONVERT-NEXT: callq returns_bf16@PLT +; +; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax +; FAST_ISEL_AVXNECONVERT-NEXT: movzwl (%rdi), %eax +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_bf16@PLT + %val = load bfloat, ptr %ptr + call bfloat @returns_bf16(bfloat %val) + unreachable +} + +define <2 x bfloat> @call_ret_v2bf16(ptr %ptr) #0 { +; +; SSE2-LABEL: call_ret_v2bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: callq returns_v2bf16@PLT +; +; FAST_ISEL_SSE2-LABEL: call_ret_v2bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: subq $40, %rsp +; FAST_ISEL_SSE2-NEXT: movl (%rdi), %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, (%rsp) +; FAST_ISEL_SSE2-NEXT: movdqa (%rsp), %xmm0 +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 
16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq returns_v2bf16@PLT +; +; AVX512BF16-LABEL: call_ret_v2bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rax +; AVX512BF16-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512BF16-NEXT: callq returns_v2bf16@PLT +; +; FAST_ISEL_AVX512BF16-LABEL: call_ret_v2bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: pushq %rax +; FAST_ISEL_AVX512BF16-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_AVX512BF16-NEXT: callq returns_v2bf16@PLT +; +; AVXNECONVERT-LABEL: call_ret_v2bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: pushq %rax +; AVXNECONVERT-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVXNECONVERT-NEXT: callq returns_v2bf16@PLT +; +; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v2bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v2bf16@PLT + %val = load <2 x bfloat>, ptr %ptr + call <2 x bfloat> @returns_v2bf16(<2 x bfloat> %val) + unreachable +} + +define <3 x bfloat> @call_ret_v3bf16(ptr %ptr) #0 { +; +; SSE2-LABEL: call_ret_v3bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movl 4(%rdi), %eax +; SSE2-NEXT: pinsrw $0, %eax, %xmm1 +; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; SSE2-NEXT: callq returns_v3bf16@PLT +; +; FAST_ISEL_SSE2-LABEL: call_ret_v3bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: subq $40, %rsp +; FAST_ISEL_SSE2-NEXT: movq (%rdi), %rax +; FAST_ISEL_SSE2-NEXT: movl %eax, %ecx +; FAST_ISEL_SSE2-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000 +; FAST_ISEL_SSE2-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movl %eax, %ecx +; FAST_ISEL_SSE2-NEXT: shll $16, %ecx +; FAST_ISEL_SSE2-NEXT: movd %ecx, %xmm0 +; FAST_ISEL_SSE2-NEXT: shrq $32, %rax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; FAST_ISEL_SSE2-NEXT: movaps %xmm1, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq returns_v3bf16@PLT +; +; 
AVX512BF16-LABEL: call_ret_v3bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rax +; AVX512BF16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512BF16-NEXT: callq returns_v3bf16@PLT +; +; FAST_ISEL_AVX512BF16-LABEL: call_ret_v3bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: pushq %rax +; FAST_ISEL_AVX512BF16-NEXT: movq (%rdi), %rax +; FAST_ISEL_AVX512BF16-NEXT: movl %eax, %ecx +; FAST_ISEL_AVX512BF16-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000 +; FAST_ISEL_AVX512BF16-NEXT: vmovd %ecx, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: movl %eax, %ecx +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %ecx +; FAST_ISEL_AVX512BF16-NEXT: vmovd %ecx, %xmm1 +; FAST_ISEL_AVX512BF16-NEXT: shrq $32, %rax +; FAST_ISEL_AVX512BF16-NEXT: shll $16, %eax +; FAST_ISEL_AVX512BF16-NEXT: vmovd %eax, %xmm2 +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm2, %xmm2 +; FAST_ISEL_AVX512BF16-NEXT: vmovd %xmm2, %eax +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm1, %xmm1 +; FAST_ISEL_AVX512BF16-NEXT: vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_AVX512BF16-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: callq returns_v3bf16@PLT +; +; AVXNECONVERT-LABEL: call_ret_v3bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: pushq %rax +; AVXNECONVERT-NEXT: movl 4(%rdi), %eax +; AVXNECONVERT-NEXT: vpinsrw $0, %eax, %xmm0, %xmm0 +; AVXNECONVERT-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVXNECONVERT-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero +; AVXNECONVERT-NEXT: callq returns_v3bf16@PLT +; +; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v3bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax +; FAST_ISEL_AVXNECONVERT-NEXT: movq (%rdi), %rax +; FAST_ISEL_AVXNECONVERT-NEXT: movl %eax, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: andl $-65536, %ecx # imm = 0xFFFF0000 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %ecx, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: movl %eax, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %ecx, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: shrq $32, %rax +; FAST_ISEL_AVXNECONVERT-NEXT: shll $16, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm2 +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm2, %xmm2 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %xmm2, %eax +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm1, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_AVXNECONVERT-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: vmovq %xmm0, %rax +; FAST_ISEL_AVXNECONVERT-NEXT: movl %eax, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: shrl $16, %ecx +; FAST_ISEL_AVXNECONVERT-NEXT: vpinsrw $0, %ecx, %xmm0, %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_AVXNECONVERT-NEXT: shrq $32, %rax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovd %eax, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpbroadcastw %xmm1, %xmm1 +; FAST_ISEL_AVXNECONVERT-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3,4,5,6,7] +; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v3bf16@PLT + %val = load <3 x bfloat>, ptr %ptr + call <3 x bfloat> @returns_v3bf16(<3 x bfloat> %val) + unreachable +} + +define <4 x bfloat> @call_ret_v4bf16(ptr %ptr) #0 { +; +; SSE2-LABEL: call_ret_v4bf16: +; SSE2: 
# %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: callq returns_v4bf16@PLT +; +; FAST_ISEL_SSE2-LABEL: call_ret_v4bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: subq $56, %rsp +; FAST_ISEL_SSE2-NEXT: movq (%rdi), %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; FAST_ISEL_SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; FAST_ISEL_SSE2-NEXT: callq returns_v4bf16@PLT +; +; AVX512BF16-LABEL: call_ret_v4bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rax +; AVX512BF16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512BF16-NEXT: callq returns_v4bf16@PLT +; +; FAST_ISEL_AVX512BF16-LABEL: call_ret_v4bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: pushq %rax +; FAST_ISEL_AVX512BF16-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FAST_ISEL_AVX512BF16-NEXT: callq returns_v4bf16@PLT +; +; AVXNECONVERT-LABEL: call_ret_v4bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: pushq %rax +; AVXNECONVERT-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVXNECONVERT-NEXT: callq returns_v4bf16@PLT +; +; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v4bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v4bf16@PLT + %val = load <4 x bfloat>, ptr %ptr + call <4 x bfloat> @returns_v4bf16(<4 x bfloat> %val) + unreachable +} + +define <8 x bfloat> @call_ret_v8bf16(ptr %ptr) #0 { +; +; SSE2-LABEL: call_ret_v8bf16: +; SSE2: # %bb.0: +; 
SSE2-NEXT: pushq %rax +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: callq returns_v8bf16@PLT +; +; FAST_ISEL_SSE2-LABEL: call_ret_v8bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: pushq %r14 +; FAST_ISEL_SSE2-NEXT: pushq %rbx +; FAST_ISEL_SSE2-NEXT: subq $56, %rsp +; FAST_ISEL_SSE2-NEXT: movdqa (%rdi), %xmm1 +; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movd %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: 
callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm1 +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; FAST_ISEL_SSE2-NEXT: callq returns_v8bf16@PLT +; +; AVX512BF16-LABEL: call_ret_v8bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rax +; AVX512BF16-NEXT: vmovaps (%rdi), %xmm0 +; AVX512BF16-NEXT: callq returns_v8bf16@PLT +; +; FAST_ISEL_AVX512BF16-LABEL: call_ret_v8bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: pushq %rax +; FAST_ISEL_AVX512BF16-NEXT: vmovaps (%rdi), %xmm0 +; FAST_ISEL_AVX512BF16-NEXT: callq returns_v8bf16@PLT +; +; AVXNECONVERT-LABEL: call_ret_v8bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: pushq %rax +; AVXNECONVERT-NEXT: vmovaps (%rdi), %xmm0 +; AVXNECONVERT-NEXT: callq returns_v8bf16@PLT +; +; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v8bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovaps (%rdi), %xmm0 +; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v8bf16@PLT + %val = load <8 x bfloat>, ptr %ptr + call <8 x bfloat> @returns_v8bf16(<8 x bfloat> %val) + unreachable +} + +define <16 x bfloat> @call_ret_v16bf16(ptr %ptr) #0 { +; +; SSE2-LABEL: call_ret_v16bf16: +; SSE2: # %bb.0: +; SSE2-NEXT: pushq %rax +; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movaps 16(%rdi), %xmm1 +; SSE2-NEXT: callq returns_v16bf16@PLT +; +; FAST_ISEL_SSE2-LABEL: call_ret_v16bf16: +; FAST_ISEL_SSE2: # %bb.0: +; FAST_ISEL_SSE2-NEXT: pushq %r14 +; FAST_ISEL_SSE2-NEXT: pushq %rbx +; FAST_ISEL_SSE2-NEXT: subq $104, %rsp +; FAST_ISEL_SSE2-NEXT: movdqa (%rdi), %xmm1 +; FAST_ISEL_SSE2-NEXT: movdqa 16(%rdi), %xmm0 +; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: movd %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $7, %xmm1, %eax +; 
FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $6, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $5, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $4, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $3, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $2, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: pextrw $1, %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movd %eax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movd %xmm1, %eax +; FAST_ISEL_SSE2-NEXT: shll $16, %eax +; FAST_ISEL_SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl 
%ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; FAST_ISEL_SSE2-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %r14d +; FAST_ISEL_SSE2-NEXT: orl %ebx, %r14d +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %ebx +; FAST_ISEL_SSE2-NEXT: shll $16, %ebx +; FAST_ISEL_SSE2-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload +; FAST_ISEL_SSE2-NEXT: # xmm0 = mem[0],zero,zero,zero +; FAST_ISEL_SSE2-NEXT: callq __truncsfbf2@PLT +; FAST_ISEL_SSE2-NEXT: pextrw $0, %xmm0, %eax +; FAST_ISEL_SSE2-NEXT: movzwl %ax, %eax +; FAST_ISEL_SSE2-NEXT: orl %ebx, %eax +; FAST_ISEL_SSE2-NEXT: shlq $32, %rax +; FAST_ISEL_SSE2-NEXT: orq %r14, %rax +; FAST_ISEL_SSE2-NEXT: movq %rax, %xmm0 +; FAST_ISEL_SSE2-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; FAST_ISEL_SSE2-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; FAST_ISEL_SSE2-NEXT: callq returns_v16bf16@PLT +; +; AVX512BF16-LABEL: call_ret_v16bf16: +; AVX512BF16: # %bb.0: +; AVX512BF16-NEXT: pushq %rax +; AVX512BF16-NEXT: vmovaps (%rdi), %ymm0 +; AVX512BF16-NEXT: callq returns_v16bf16@PLT +; +; FAST_ISEL_AVX512BF16-LABEL: 
call_ret_v16bf16: +; FAST_ISEL_AVX512BF16: # %bb.0: +; FAST_ISEL_AVX512BF16-NEXT: pushq %rax +; FAST_ISEL_AVX512BF16-NEXT: vmovaps (%rdi), %ymm0 +; FAST_ISEL_AVX512BF16-NEXT: callq returns_v16bf16@PLT +; +; AVXNECONVERT-LABEL: call_ret_v16bf16: +; AVXNECONVERT: # %bb.0: +; AVXNECONVERT-NEXT: pushq %rax +; AVXNECONVERT-NEXT: vmovaps (%rdi), %ymm0 +; AVXNECONVERT-NEXT: callq returns_v16bf16@PLT +; +; FAST_ISEL_AVXNECONVERT-LABEL: call_ret_v16bf16: +; FAST_ISEL_AVXNECONVERT: # %bb.0: +; FAST_ISEL_AVXNECONVERT-NEXT: pushq %rax +; FAST_ISEL_AVXNECONVERT-NEXT: vmovaps (%rdi), %ymm0 +; FAST_ISEL_AVXNECONVERT-NEXT: callq returns_v16bf16@PLT + %val = load <16 x bfloat>, ptr %ptr + call <16 x bfloat> @returns_v16bf16(<16 x bfloat> %val) + unreachable +} + +attributes #0 = { nounwind } diff --git a/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll b/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll index 7c100f579399d..67dec9178eeca 100644 --- a/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll +++ b/llvm/test/Transforms/InstCombine/select-with-bitwise-ops.ll @@ -20,6 +20,34 @@ define i32 @select_icmp_eq_and_1_0_or_2(i32 %x, i32 %y) { ret i32 %select } +define i32 @select_icmp_eq_and_1_0_or_2_disjoint(i32 %x, i32 %y) { +; CHECK-LABEL: @select_icmp_eq_and_1_0_or_2_disjoint( +; CHECK-NEXT: [[AND:%.*]] = shl i32 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = or disjoint i32 [[Y:%.*]], [[TMP1]] +; CHECK-NEXT: ret i32 [[SELECT]] +; + %and = and i32 %x, 1 + %cmp = icmp eq i32 %and, 0 + %or = or disjoint i32 %y, 2 + %select = select i1 %cmp, i32 %y, i32 %or + ret i32 %select +} + +define i32 @select_icmp_eq_and_1_0_add_2_nsw_nuw(i32 %x, i32 %y) { +; CHECK-LABEL: @select_icmp_eq_and_1_0_add_2_nsw_nuw( +; CHECK-NEXT: [[AND:%.*]] = shl i32 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[AND]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = add nuw nsw i32 [[Y:%.*]], [[TMP1]] +; CHECK-NEXT: ret i32 [[SELECT]] +; + %and = and i32 %x, 1 + %cmp = icmp eq i32 %and, 0 + %or = add nsw nuw i32 %y, 2 + %select = select i1 %cmp, i32 %y, i32 %or + ret i32 %select +} + define <2 x i32> @select_icmp_eq_and_1_0_or_2_vec(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @select_icmp_eq_and_1_0_or_2_vec( ; CHECK-NEXT: [[AND:%.*]] = shl <2 x i32> [[X:%.*]], splat (i32 1) @@ -1696,6 +1724,20 @@ define i8 @select_icmp_eq_and_1_0_lshr_fv(i8 %x, i8 %y) { ret i8 %select } +define i8 @select_icmp_eq_and_1_0_lshr_exact_fv(i8 %x, i8 %y) { +; CHECK-LABEL: @select_icmp_eq_and_1_0_lshr_exact_fv( +; CHECK-NEXT: [[AND:%.*]] = shl i8 [[X:%.*]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = and i8 [[AND]], 2 +; CHECK-NEXT: [[SELECT:%.*]] = lshr exact i8 [[Y:%.*]], [[TMP1]] +; CHECK-NEXT: ret i8 [[SELECT]] +; + %and = and i8 %x, 1 + %cmp = icmp eq i8 %and, 0 + %blshr = lshr exact i8 %y, 2 + %select = select i1 %cmp, i8 %y, i8 %blshr + ret i8 %select +} + define i8 @select_icmp_eq_and_1_0_lshr_tv(i8 %x, i8 %y) { ; CHECK-LABEL: @select_icmp_eq_and_1_0_lshr_tv( ; CHECK-NEXT: [[AND:%.*]] = shl i8 [[X:%.*]], 1 diff --git a/llvm/unittests/Target/DirectX/CMakeLists.txt b/llvm/unittests/Target/DirectX/CMakeLists.txt index 626c0d6384268..fd0d5a0dd52c1 100644 --- a/llvm/unittests/Target/DirectX/CMakeLists.txt +++ b/llvm/unittests/Target/DirectX/CMakeLists.txt @@ -8,10 +8,12 @@ set(LLVM_LINK_COMPONENTS Core DirectXCodeGen DirectXPointerTypeAnalysis + Passes Support ) add_llvm_target_unittest(DirectXTests CBufferDataLayoutTests.cpp PointerTypeAnalysisTests.cpp + UniqueResourceFromUseTests.cpp ) diff --git 
a/llvm/unittests/Target/DirectX/UniqueResourceFromUseTests.cpp b/llvm/unittests/Target/DirectX/UniqueResourceFromUseTests.cpp new file mode 100644 index 0000000000000..f272381c0c250 --- /dev/null +++ b/llvm/unittests/Target/DirectX/UniqueResourceFromUseTests.cpp @@ -0,0 +1,283 @@ +//===- llvm/unittests/Target/DirectX/UniqueResourceFromUseTests.cpp -------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DirectXTargetMachine.h" +#include "llvm/Analysis/DXILResource.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/CodeGen/CommandFlags.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/SourceMgr.h" + +#include "gtest/gtest.h" + +using namespace llvm; +using namespace llvm::dxil; + +namespace { +class UniqueResourceFromUseTest : public testing::Test { +protected: + PassBuilder *PB; + ModuleAnalysisManager *MAM; + + virtual void SetUp() { + MAM = new ModuleAnalysisManager(); + PB = new PassBuilder(); + PB->registerModuleAnalyses(*MAM); + MAM->registerPass([&] { return DXILResourceTypeAnalysis(); }); + MAM->registerPass([&] { return DXILResourceBindingAnalysis(); }); + } + + virtual void TearDown() { + delete PB; + delete MAM; + } +}; + +TEST_F(UniqueResourceFromUseTest, TestTrivialUse) { + StringRef Assembly = R"( +define void @main() { +entry: + %handle = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 1, i32 2, i32 3, i32 4, i1 false) + call void @a.func(target("dx.RawBuffer", float, 1, 0) %handle) + call void @a.func(target("dx.RawBuffer", float, 1, 0) %handle) + ret void +} + +declare target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32, i32, i32, i32, i1) +declare void @a.func(target("dx.RawBuffer", float, 1, 0) %handle) + )"; + + LLVMContext Context; + SMDiagnostic Error; + auto M = parseAssemblyString(Assembly, Error, Context); + ASSERT_TRUE(M) << "Bad assembly?"; + + const DXILBindingMap &DBM = MAM->getResult<DXILResourceBindingAnalysis>(*M); + for (const Function &F : M->functions()) { + if (F.getName() != "a.func") { + continue; + } + + unsigned CalledResources = 0; + + for (const User *U : F.users()) { + const CallInst *CI = cast<CallInst>(U); + const Value *Handle = CI->getArgOperand(0); + const auto Bindings = DBM.findByUse(Handle); + ASSERT_EQ(Bindings.size(), 1u) + << "Handle should resolve into one resource"; + + auto Binding = Bindings[0].getBinding(); + EXPECT_EQ(0u, Binding.RecordID); + EXPECT_EQ(1u, Binding.Space); + EXPECT_EQ(2u, Binding.LowerBound); + EXPECT_EQ(3u, Binding.Size); + + CalledResources++; + } + + EXPECT_EQ(2u, CalledResources) + << "Expected 2 resolved calls to create resource"; + } +} + +TEST_F(UniqueResourceFromUseTest, TestIndirectUse) { + StringRef Assembly = R"( +define void @foo() { + %handle = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 1, i32 2, i32 3, i32 4, i1 false) + %handle2 = call target("dx.RawBuffer", float, 1, 0) @ind.func(target("dx.RawBuffer", float, 1, 0) %handle) + %handle3 = call target("dx.RawBuffer", float, 1, 0) @ind.func(target("dx.RawBuffer", float, 1, 0) %handle2) + %handle4 = 
call target("dx.RawBuffer", float, 1, 0) @ind.func(target("dx.RawBuffer", float, 1, 0) %handle3) + call void @a.func(target("dx.RawBuffer", float, 1, 0) %handle4) + ret void +} + +declare target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32, i32, i32, i32, i1) +declare void @a.func(target("dx.RawBuffer", float, 1, 0) %handle) +declare target("dx.RawBuffer", float, 1, 0) @ind.func(target("dx.RawBuffer", float, 1, 0) %handle) + )"; + + LLVMContext Context; + SMDiagnostic Error; + auto M = parseAssemblyString(Assembly, Error, Context); + ASSERT_TRUE(M) << "Bad assembly?"; + + const DXILBindingMap &DBM = MAM->getResult(*M); + for (const Function &F : M->functions()) { + if (F.getName() != "a.func") { + continue; + } + + unsigned CalledResources = 0; + + for (const User *U : F.users()) { + const CallInst *CI = cast(U); + const Value *Handle = CI->getArgOperand(0); + const auto Bindings = DBM.findByUse(Handle); + ASSERT_EQ(Bindings.size(), 1u) + << "Handle should resolve into one resource"; + + auto Binding = Bindings[0].getBinding(); + EXPECT_EQ(0u, Binding.RecordID); + EXPECT_EQ(1u, Binding.Space); + EXPECT_EQ(2u, Binding.LowerBound); + EXPECT_EQ(3u, Binding.Size); + + CalledResources++; + } + + EXPECT_EQ(1u, CalledResources) + << "Expected 1 resolved call to create resource"; + } +} + +TEST_F(UniqueResourceFromUseTest, TestAmbigousIndirectUse) { + StringRef Assembly = R"( +define void @foo() { + %foo = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 1, i32 1, i32 1, i32 1, i1 false) + %bar = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 2, i32 2, i32 2, i32 2, i1 false) + %baz = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 3, i32 3, i32 3, i32 3, i1 false) + %bat = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 4, i32 4, i32 4, i32 4, i1 false) + %a = call target("dx.RawBuffer", float, 1, 0) @ind.func(target("dx.RawBuffer", float, 1, 0) %foo, target("dx.RawBuffer", float, 1, 0) %bar) + %b = call target("dx.RawBuffer", float, 1, 0) @ind.func(target("dx.RawBuffer", float, 1, 0) %baz, target("dx.RawBuffer", float, 1, 0) %bat) + %handle = call target("dx.RawBuffer", float, 1, 0) @ind.func(target("dx.RawBuffer", float, 1, 0) %a, target("dx.RawBuffer", float, 1, 0) %b) + call void @a.func(target("dx.RawBuffer", float, 1, 0) %handle) + ret void +} + +declare target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32, i32, i32, i32, i1) +declare void @a.func(target("dx.RawBuffer", float, 1, 0) %handle) +declare target("dx.RawBuffer", float, 1, 0) @ind.func(target("dx.RawBuffer", float, 1, 0) %x, target("dx.RawBuffer", float, 1, 0) %y) + )"; + + LLVMContext Context; + SMDiagnostic Error; + auto M = parseAssemblyString(Assembly, Error, Context); + ASSERT_TRUE(M) << "Bad assembly?"; + + const DXILBindingMap &DBM = MAM->getResult(*M); + for (const Function &F : M->functions()) { + if (F.getName() != "a.func") { + continue; + } + + unsigned CalledResources = 0; + + for (const User *U : F.users()) { + const CallInst *CI = cast(U); + const Value *Handle = CI->getArgOperand(0); + const auto Bindings = DBM.findByUse(Handle); + ASSERT_EQ(Bindings.size(), 4u) + << "Handle should resolve into four resources"; + + auto Binding = Bindings[0].getBinding(); + EXPECT_EQ(0u, Binding.RecordID); + 
EXPECT_EQ(1u, Binding.Space); + EXPECT_EQ(1u, Binding.LowerBound); + EXPECT_EQ(1u, Binding.Size); + + Binding = Bindings[1].getBinding(); + EXPECT_EQ(1u, Binding.RecordID); + EXPECT_EQ(2u, Binding.Space); + EXPECT_EQ(2u, Binding.LowerBound); + EXPECT_EQ(2u, Binding.Size); + + Binding = Bindings[2].getBinding(); + EXPECT_EQ(2u, Binding.RecordID); + EXPECT_EQ(3u, Binding.Space); + EXPECT_EQ(3u, Binding.LowerBound); + EXPECT_EQ(3u, Binding.Size); + + Binding = Bindings[3].getBinding(); + EXPECT_EQ(3u, Binding.RecordID); + EXPECT_EQ(4u, Binding.Space); + EXPECT_EQ(4u, Binding.LowerBound); + EXPECT_EQ(4u, Binding.Size); + + CalledResources++; + } + + EXPECT_EQ(1u, CalledResources) + << "Expected 1 resolved call to create resource"; + } +} + +TEST_F(UniqueResourceFromUseTest, TestConditionalUse) { + StringRef Assembly = R"( +define void @foo(i32 %n) { +entry: + %x = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 1, i32 1, i32 1, i32 1, i1 false) + %y = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32 4, i32 4, i32 4, i32 4, i1 false) + %cond = icmp eq i32 %n, 0 + br i1 %cond, label %bb.true, label %bb.false + +bb.true: + %handle_t = call target("dx.RawBuffer", float, 1, 0) @ind.func(target("dx.RawBuffer", float, 1, 0) %x) + br label %bb.exit + +bb.false: + %handle_f = call target("dx.RawBuffer", float, 1, 0) @ind.func(target("dx.RawBuffer", float, 1, 0) %y) + br label %bb.exit + +bb.exit: + %handle = phi target("dx.RawBuffer", float, 1, 0) [ %handle_t, %bb.true ], [ %handle_f, %bb.false ] + call void @a.func(target("dx.RawBuffer", float, 1, 0) %handle) + ret void +} + +declare target("dx.RawBuffer", float, 1, 0) @llvm.dx.resource.handlefrombinding.tdx.RawBuffer_f32_1_0t(i32, i32, i32, i32, i1) +declare void @a.func(target("dx.RawBuffer", float, 1, 0) %handle) +declare target("dx.RawBuffer", float, 1, 0) @ind.func(target("dx.RawBuffer", float, 1, 0) %x) + )"; + + LLVMContext Context; + SMDiagnostic Error; + auto M = parseAssemblyString(Assembly, Error, Context); + ASSERT_TRUE(M) << "Bad assembly?"; + + const DXILBindingMap &DBM = MAM->getResult<DXILResourceBindingAnalysis>(*M); + for (const Function &F : M->functions()) { + if (F.getName() != "a.func") { + continue; + } + + unsigned CalledResources = 0; + + for (const User *U : F.users()) { + const CallInst *CI = cast<CallInst>(U); + const Value *Handle = CI->getArgOperand(0); + const auto Bindings = DBM.findByUse(Handle); + ASSERT_EQ(Bindings.size(), 2u) + << "Handle should resolve into two resources"; + + auto Binding = Bindings[0].getBinding(); + EXPECT_EQ(0u, Binding.RecordID); + EXPECT_EQ(1u, Binding.Space); + EXPECT_EQ(1u, Binding.LowerBound); + EXPECT_EQ(1u, Binding.Size); + + Binding = Bindings[1].getBinding(); + EXPECT_EQ(1u, Binding.RecordID); + EXPECT_EQ(4u, Binding.Space); + EXPECT_EQ(4u, Binding.LowerBound); + EXPECT_EQ(4u, Binding.Size); + + CalledResources++; + } + + EXPECT_EQ(1u, CalledResources) + << "Expected 1 resolved call to create resource"; + } +} + +} // namespace diff --git a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp index 0d5d86acaee89..97724100ba341 100644 --- a/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/SandboxVectorizer/SchedulerTest.cpp @@ -246,6 +246,78 @@ define void @foo(ptr noalias %ptr0, ptr noalias %ptr1) { EXPECT_TRUE(Sched.trySchedule({L0, L1})); } +// Check 
scheduling in the following order: {A0,A1},{B0,B1},{C0,C1},{D0,D1} +// assuming program order: B0,B1,C0,C1,D0,D1,A0,A1. +// This will effectively schedule nodes below already scheduled nodes, which +// can expose issues in the code that adds nodes to the ready list. +// For example, we schedule {D0,D1} while {C0,C1} are scheduled and there is +// a dependency D0->C0 and D1->C1. +// +// {A0,A1} {B0,B1} {C0,C1} {D0,D1} +// B0,B1 | S +// |\ | +// | C0,C1 | | S | S +// | | \ | | +// | | D0,D1 | | S +// | / | +// A0,A1 | S | S +// +------------------------+ +// | Legend |: DAG | +// | S: Scheduled | +TEST_F(SchedulerTest, ScheduledPredecessors) { + parseIR(C, R"IR( +define void @foo(ptr noalias %ptrA0, ptr noalias %ptrA1, + ptr noalias %ptrB0, ptr noalias %ptrB1, + ptr noalias %ptrD0, ptr noalias %ptrD1) { + %B0 = load i8, ptr %ptrB0 + %B1 = load i8, ptr %ptrB1 + %C0 = add i8 %B0, 0 + %C1 = add i8 %B1, 1 + store i8 %C0, ptr %ptrD0 + store i8 %C1, ptr %ptrD1 + store i8 %B0, ptr %ptrA0 + store i8 %B1, ptr %ptrA1 + ret void +} +)IR"); + llvm::Function *LLVMF = &*M->getFunction("foo"); + sandboxir::Context Ctx(C); + auto *F = Ctx.createFunction(LLVMF); + auto *BB = &*F->begin(); + auto It = BB->begin(); + auto *B1 = cast<sandboxir::LoadInst>(&*It++); + auto *B0 = cast<sandboxir::LoadInst>(&*It++); + auto *C1 = cast<sandboxir::BinaryOperator>(&*It++); + auto *C0 = cast<sandboxir::BinaryOperator>(&*It++); + auto *D1 = cast<sandboxir::StoreInst>(&*It++); + auto *D0 = cast<sandboxir::StoreInst>(&*It++); + auto *A1 = cast<sandboxir::StoreInst>(&*It++); + auto *A0 = cast<sandboxir::StoreInst>(&*It++); + auto *Ret = cast<sandboxir::ReturnInst>(&*It++); + (void)Ret; + + sandboxir::Scheduler Sched(getAA(*LLVMF), Ctx); + EXPECT_TRUE(Sched.trySchedule({A0, A1})); + // NOTE: We schedule the intermediate nodes between {A0,A1} and {B0,B1} by + // hand one by one to make sure they are scheduled in that order because + // the scheduler may reorder them a bit if we let it. + EXPECT_TRUE(Sched.trySchedule(D0)); + EXPECT_TRUE(Sched.trySchedule(D1)); + EXPECT_TRUE(Sched.trySchedule(C0)); + EXPECT_TRUE(Sched.trySchedule(C1)); + EXPECT_TRUE(Sched.trySchedule({B0, B1})); + // At this point all nodes must have been scheduled from B0,B1 to A0,A1. + // The ones in between are scheduled as single-instruction nodes. + // So when we attempt to schedule {C0,C1} we will need to reschedule. + // At this point we will trim the schedule from {C0,C1} upwards. + EXPECT_TRUE(Sched.trySchedule({C0, C1})); + // Now the schedule should only contain {C0,C1} which should be marked as + // "scheduled". + // {D0,D1} are below {C0,C1}, so we grow the DAG downwards, while + // {C0,C1} are marked as "scheduled" above them. 
+ EXPECT_TRUE(Sched.trySchedule({D0, D1})); +} + TEST_F(SchedulerTest, DontCrossBBs) { parseIR(C, R"IR( define void @foo(ptr noalias %ptr0, ptr noalias %ptr1, i8 %v0, i8 %v1) { diff --git a/llvm/utils/gn/secondary/llvm/unittests/Target/DirectX/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/Target/DirectX/BUILD.gn index 0a8ec6ac0f789..5673e2056a721 100644 --- a/llvm/utils/gn/secondary/llvm/unittests/Target/DirectX/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/unittests/Target/DirectX/BUILD.gn @@ -16,5 +16,6 @@ unittest("DirextXTests") { sources = [ "CBufferDataLayoutTests.cpp", "PointerTypeAnalysisTests.cpp", + "UniqueResourceFromUseTests.cpp", ] } diff --git a/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h b/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h index 517351cac6dbc..0608182f00b7e 100644 --- a/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h +++ b/mlir/include/mlir/Bindings/Python/NanobindAdaptors.h @@ -23,8 +23,10 @@ #include "mlir-c/Diagnostics.h" #include "mlir-c/IR.h" +// clang-format off #include "mlir/Bindings/Python/Nanobind.h" #include "mlir-c/Bindings/Python/Interop.h" // This is expected after nanobind. +// clang-format on #include "llvm/ADT/Twine.h" // Raw CAPI type casters need to be declared before use, so always include them @@ -349,6 +351,7 @@ class pure_subclass { thisClass = metaclass(derivedClassName, nanobind::make_tuple(superClass), attributes); scope.attr(derivedClassName) = thisClass; + thisClass.attr("__module__") = scope.attr("__name__"); } template <typename Func, typename... Extra> @@ -434,7 +437,7 @@ class mlir_attribute_subclass : public pure_subclass { const nanobind::object &superCls, GetTypeIDFunctionTy getTypeIDFunction = nullptr) : pure_subclass(scope, typeClassName, superCls) { - // Casting constructor. Note that it hard, if not impossible, to properly + // Casting constructor. Note that it is hard, if not impossible, to properly // call chain to parent `__init__` in nanobind due to its special handling // for init functions that don't have a fully constructed self-reference, // which makes it impossible to forward it to `__init__` of a superclass. @@ -465,10 +468,13 @@ class mlir_attribute_subclass : public pure_subclass { thisClass.attr("__new__") = newCf; // 'isinstance' method. + static const char kIsinstanceSig[] = + "def isinstance(other_attribute: " MAKE_MLIR_PYTHON_QUALNAME( + "ir") ".Attribute) -> bool"; def_staticmethod( "isinstance", [isaFunction](MlirAttribute other) { return isaFunction(other); }, - nanobind::arg("other_attribute")); + nanobind::arg("other_attribute"), nanobind::sig(kIsinstanceSig)); def("__repr__", [superCls, captureTypeName](nanobind::object self) { return nanobind::repr(superCls(self)) .attr("replace")(superCls.attr("__name__"), captureTypeName); }); if (getTypeIDFunction) { // 'get_static_typeid' method. @@ -512,7 +518,7 @@ class mlir_type_subclass : public pure_subclass { const nanobind::object &superCls, GetTypeIDFunctionTy getTypeIDFunction = nullptr) : pure_subclass(scope, typeClassName, superCls) { - // Casting constructor. Note that it hard, if not impossible, to properly + // Casting constructor. Note that it is hard, if not impossible, to properly // call chain to parent `__init__` in nanobind due to its special handling // for init functions that don't have a fully constructed self-reference, // which makes it impossible to forward it to `__init__` of a superclass. @@ -542,13 +548,17 @@ class mlir_type_subclass : public pure_subclass { thisClass.attr("__new__") = newCf; // 'isinstance' method. 
+ static const char kIsinstanceSig[] = + "def isinstance(other_type: " MAKE_MLIR_PYTHON_QUALNAME( + "ir") ".Type) -> bool"; def_staticmethod( "isinstance", [isaFunction](MlirType other) { return isaFunction(other); }, - nanobind::arg("other_type")); + nanobind::arg("other_type"), nanobind::sig(kIsinstanceSig)); def("__repr__", [superCls, captureTypeName](nanobind::object self) { - return nanobind::repr(superCls(self)) - .attr("replace")(superCls.attr("__name__"), captureTypeName); + return nanobind::cast<std::string>( + nanobind::repr(superCls(self)) + .attr("replace")(superCls.attr("__name__"), captureTypeName)); }); if (getTypeIDFunction) { // 'get_static_typeid' method. @@ -590,7 +600,7 @@ class mlir_value_subclass : public pure_subclass { IsAFunctionTy isaFunction, const nanobind::object &superCls) : pure_subclass(scope, valueClassName, superCls) { - // Casting constructor. Note that it hard, if not impossible, to properly + // Casting constructor. Note that it is hard, if not impossible, to properly // call chain to parent `__init__` in nanobind due to its special handling // for init functions that don't have a fully constructed self-reference, // which makes it impossible to forward it to `__init__` of a superclass. @@ -620,10 +630,13 @@ thisClass.attr("__new__") = newCf; // 'isinstance' method. + static const char kIsinstanceSig[] = + "def isinstance(other_value: " MAKE_MLIR_PYTHON_QUALNAME( + "ir") ".Value) -> bool"; def_staticmethod( "isinstance", [isaFunction](MlirValue other) { return isaFunction(other); }, - nanobind::arg("other_value")); + nanobind::arg("other_value"), nanobind::sig(kIsinstanceSig)); } }; diff --git a/mlir/lib/Interfaces/FunctionInterfaces.cpp b/mlir/lib/Interfaces/FunctionInterfaces.cpp index 80f47a3f83676..57a8668117c68 100644 --- a/mlir/lib/Interfaces/FunctionInterfaces.cpp +++ b/mlir/lib/Interfaces/FunctionInterfaces.cpp @@ -199,8 +199,7 @@ void function_interface_impl::insertFunctionArguments( // There are 3 things that need to be updated: // - Function type. // - Arg attrs. - // - Block arguments of entry block. - Block &entry = op->getRegion(0).front(); + // - Block arguments of entry block, if not empty. // Update the argument attributes of the function. ArrayAttr oldArgAttrs = op.getArgAttrsAttr(); @@ -226,10 +225,15 @@ setAllArgAttrDicts(op, newArgAttrs); } - // Update the function type and any entry block arguments. + // Update the function type. op.setFunctionTypeAttr(TypeAttr::get(newType)); - for (unsigned i = 0, e = argIndices.size(); i < e; ++i) - entry.insertArgument(argIndices[i] + i, argTypes[i], argLocs[i]); + + // Update entry block arguments, if not empty. + if (!op.isExternal()) { + Block &entry = op->getRegion(0).front(); + for (unsigned i = 0, e = argIndices.size(); i < e; ++i) + entry.insertArgument(argIndices[i] + i, argTypes[i], argLocs[i]); + } } void function_interface_impl::insertFunctionResults( @@ -279,8 +283,7 @@ void function_interface_impl::eraseFunctionArguments( // There are 3 things that need to be updated: // - Function type. // - Arg attrs. - // - Block arguments of entry block. - Block &entry = op->getRegion(0).front(); + // - Block arguments of entry block, if not empty. // Update the argument attributes of the function. 
if (ArrayAttr argAttrs = op.getArgAttrsAttr()) { @@ -292,9 +295,14 @@ void function_interface_impl::eraseFunctionArguments( setAllArgAttrDicts(op, newArgAttrs); } - // Update the function type and any entry block arguments. + // Update the function type. op.setFunctionTypeAttr(TypeAttr::get(newType)); - entry.eraseArguments(argIndices); + + // Update entry block arguments, if not empty. + if (!op.isExternal()) { + Block &entry = op->getRegion(0).front(); + entry.eraseArguments(argIndices); + } } void function_interface_impl::eraseFunctionResults(
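The FunctionInterfaces.cpp hunks above all hinge on one invariant: an external function (a declaration) has no entry block, so `op->getRegion(0).front()` may only be dereferenced behind an `op.isExternal()` check, while the function type attribute must be updated either way. Below is a minimal sketch of the same guard, assuming an op that implements mlir::FunctionOpInterface and carries a builtin FunctionType; the helper name appendArgument is hypothetical and not part of this patch:

#include "mlir/IR/Block.h"
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/BuiltinTypes.h"
#include "mlir/Interfaces/FunctionInterfaces.h"
#include "llvm/ADT/SmallVector.h"

// Hypothetical helper: append one argument of type `ty` to `fn`. Argument
// attributes are ignored for brevity; the real patch also shifts those.
static void appendArgument(mlir::FunctionOpInterface fn, mlir::Type ty,
                           mlir::Location loc) {
  // Rebuild the function type with one extra input; this step is valid for
  // both defined and declared functions.
  llvm::SmallVector<mlir::Type> inputs(fn.getArgumentTypes().begin(),
                                       fn.getArgumentTypes().end());
  inputs.push_back(ty);
  fn.setFunctionTypeAttr(mlir::TypeAttr::get(
      mlir::FunctionType::get(fn->getContext(), inputs, fn.getResultTypes())));
  // Only a function with a body owns an entry block whose arguments must stay
  // in sync with the type; calling front() on the empty region of a
  // declaration is exactly the failure the guard avoids.
  if (!fn.isExternal()) {
    mlir::Block &entry = fn->getRegion(0).front();
    entry.addArgument(ty, loc);
  }
}

Updating the type first and gating only the block-argument update mirrors the ordering in the patch and keeps declarations and definitions on a single code path.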