From 6f8f459163a637b6d0435725aa63c149fdf914c6 Mon Sep 17 00:00:00 2001 From: Andrei Elovikov Date: Tue, 1 Apr 2025 07:59:52 -0700 Subject: [PATCH] [SYCL] Reduce compile time overhead of `StoreLambda` for simple kernels In particular, those that are trivially copyable and destructible (so, no accessors or other special classes as arguments). Doesn't seem to help :( --- sycl/include/sycl/detail/cg_types.hpp | 122 ++++++++++++++++---------- sycl/include/sycl/handler.hpp | 20 ++++- sycl/source/CMakeLists.txt | 1 + sycl/source/cg_types.cpp | 18 ++++ 4 files changed, 113 insertions(+), 48 deletions(-) create mode 100644 sycl/source/cg_types.cpp diff --git a/sycl/include/sycl/detail/cg_types.hpp b/sycl/include/sycl/detail/cg_types.hpp index fa144d8b0a792..f4776bfc5e2af 100644 --- a/sycl/include/sycl/detail/cg_types.hpp +++ b/sycl/include/sycl/detail/cg_types.hpp @@ -34,6 +34,10 @@ inline namespace _V1 { class interop_handle; class handler; namespace detail { +// Prevent argument from being removed by the optimized. Needed for different +// host functions referencing kernel that we instantiate but don't intend to +// call on host (e.g. to preserve symbols for the debugger). +__SYCL_EXPORT bool do_not_dce(void (*)(void *)); class HostTask; /// Type of the command group. @@ -163,6 +167,55 @@ class HostKernelBase { virtual void InstantiateKernelOnHost() = 0; }; +template +void InstantiateKernelOnHost(void *p) { + auto &MKernel = *static_cast(p); + using IDBuilder = sycl::detail::Builder; + if constexpr (std::is_same_v) { + runKernelWithoutArg(MKernel); + } else if constexpr (std::is_same_v>) { + sycl::id ID = InitializedVal::template get<0>(); + runKernelWithArg(MKernel, ID); + } else if constexpr (std::is_same_v> || + std::is_same_v>) { + constexpr bool HasOffset = std::is_same_v>; + if constexpr (!HasOffset) { + KernelArgType Item = IDBuilder::createItem( + InitializedVal::template get<1>(), + InitializedVal::template get<0>()); + runKernelWithArg(MKernel, Item); + } else { + KernelArgType Item = IDBuilder::createItem( + InitializedVal::template get<1>(), + InitializedVal::template get<0>(), + InitializedVal::template get<0>()); + runKernelWithArg(MKernel, Item); + } + } else if constexpr (std::is_same_v>) { + sycl::range Range = InitializedVal::template get<1>(); + sycl::id ID = InitializedVal::template get<0>(); + sycl::group Group = + IDBuilder::createGroup(Range, Range, Range, ID); + sycl::item GlobalItem = + IDBuilder::createItem(Range, ID, ID); + sycl::item LocalItem = + IDBuilder::createItem(Range, ID); + KernelArgType NDItem = + IDBuilder::createNDItem(GlobalItem, LocalItem, Group); + runKernelWithArg(MKernel, NDItem); + } else if constexpr (std::is_same_v>) { + sycl::range Range = InitializedVal::template get<1>(); + sycl::id ID = InitializedVal::template get<0>(); + KernelArgType Group = IDBuilder::createGroup(Range, Range, Range, ID); + runKernelWithArg(MKernel, Group); + } else { + // Assume that anything else can be default-constructed. If not, this + // should fail to compile and the implementor should implement a generic + // case for the new argument type. + runKernelWithArg(MKernel, KernelArgType{}); + } +} + // Class which stores specific lambda object. template class HostKernel : public HostKernelBase { @@ -181,55 +234,32 @@ class HostKernel : public HostKernelBase { // kernel code instructions with source code lines. // NOTE: InstatiateKernelOnHost() should not be called. void InstantiateKernelOnHost() override { - using IDBuilder = sycl::detail::Builder; - if constexpr (std::is_same_v) { - runKernelWithoutArg(MKernel); - } else if constexpr (std::is_same_v>) { - sycl::id ID = InitializedVal::template get<0>(); - runKernelWithArg(MKernel, ID); - } else if constexpr (std::is_same_v> || - std::is_same_v>) { - constexpr bool HasOffset = - std::is_same_v>; - if constexpr (!HasOffset) { - KernelArgType Item = IDBuilder::createItem( - InitializedVal::template get<1>(), - InitializedVal::template get<0>()); - runKernelWithArg(MKernel, Item); - } else { - KernelArgType Item = IDBuilder::createItem( - InitializedVal::template get<1>(), - InitializedVal::template get<0>(), - InitializedVal::template get<0>()); - runKernelWithArg(MKernel, Item); - } - } else if constexpr (std::is_same_v>) { - sycl::range Range = InitializedVal::template get<1>(); - sycl::id ID = InitializedVal::template get<0>(); - sycl::group Group = - IDBuilder::createGroup(Range, Range, Range, ID); - sycl::item GlobalItem = - IDBuilder::createItem(Range, ID, ID); - sycl::item LocalItem = - IDBuilder::createItem(Range, ID); - KernelArgType NDItem = - IDBuilder::createNDItem(GlobalItem, LocalItem, Group); - runKernelWithArg(MKernel, NDItem); - } else if constexpr (std::is_same_v>) { - sycl::range Range = InitializedVal::template get<1>(); - sycl::id ID = InitializedVal::template get<0>(); - KernelArgType Group = - IDBuilder::createGroup(Range, Range, Range, ID); - runKernelWithArg(MKernel, Group); - } else { - // Assume that anything else can be default-constructed. If not, this - // should fail to compile and the implementor should implement a generic - // case for the new argument type. - runKernelWithArg(MKernel, KernelArgType{}); - } + detail::InstantiateKernelOnHost(&MKernel); } }; +class SimpleHostKernel : public HostKernelBase { + std::unique_ptr KernelBytes; + +public: + template + SimpleHostKernel(const KernelType &KernelFunc, KernelArgType *, std::integral_constant) + : KernelBytes(new( + std::align_val_t(alignof(KernelType))) char[sizeof(KernelType)]) { + std::memcpy(KernelBytes.get(), &KernelFunc, sizeof(KernelType)); + // Hopefully, minimal run-time overhead: + static thread_local auto ignore = do_not_dce( + &detail::InstantiateKernelOnHost); + (void)ignore; + } + char *getPtr() override { return KernelBytes.get(); } + ~SimpleHostKernel() override = default; + void InstantiateKernelOnHost() override { + // We do this in the ctor instead. + } +}; + + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index e487a1826b982..1110bbbc10508 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -739,8 +739,24 @@ class __SYCL_EXPORT handler { // Not using `std::make_unique` to avoid unnecessary instantiations of // `std::unique_ptr>`. Only // `std::unique_ptr` is necessary. - MHostKernel.reset(new detail::HostKernel( - std::forward(KernelFunc))); + +#if __has_builtin(__is_trivially_copyable) && \ + __has_builtin(__is_trivially_destructible) + // libstdc++ implementation of std::is_trivially_[copyable|destructible]_v + // has more compile time impact than the benefits of using single + // `SimpleHostKernel` vs. intstantiating `HostKernel` for each + // `KernelFunc`. As such, only do this optimization of the compilation time + // when using `clang` as host compiler. + if constexpr (__is_trivially_copyable(KernelType) && + __is_trivially_destructible(KernelType)) + MHostKernel.reset(new detail::SimpleHostKernel( + std::forward(KernelFunc), + static_cast(nullptr), + std::integral_constant{})); + else +#endif + MHostKernel.reset(new detail::HostKernel( + std::forward(KernelFunc))); constexpr bool KernelHasName = detail::getKernelName() != nullptr && diff --git a/sycl/source/CMakeLists.txt b/sycl/source/CMakeLists.txt index 2570921a2d565..76ac314cad049 100644 --- a/sycl/source/CMakeLists.txt +++ b/sycl/source/CMakeLists.txt @@ -233,6 +233,7 @@ set(SYCL_COMMON_SOURCES "builtins/math_functions.cpp" "builtins/native_math_functions.cpp" "builtins/relational_functions.cpp" + "cg_types.cpp" "detail/accessor_impl.cpp" "detail/allowlist.cpp" "detail/bindless_images.cpp" diff --git a/sycl/source/cg_types.cpp b/sycl/source/cg_types.cpp new file mode 100644 index 0000000000000..65e346377b2f2 --- /dev/null +++ b/sycl/source/cg_types.cpp @@ -0,0 +1,18 @@ +//==---- cg_types.cpp - Auxiliary types required by command group class ----==// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include + +namespace sycl { +inline namespace _V1 { + +namespace detail { +__SYCL_EXPORT bool do_not_dce(void (*)(void *)) { return true; } +} // namespace detail +} // namespace _V1 +} // namespace sycl