Skip to content

Commit 6f8f459

Browse files
[SYCL] Reduce compile time overhead of StoreLambda for simple kernels
In particular, those that are trivially copyable and destructible (so, no accessors or other special classes as arguments). Doesn't seem to help :(
1 parent ac30c32 commit 6f8f459

File tree

4 files changed

+113
-48
lines changed

4 files changed

+113
-48
lines changed

sycl/include/sycl/detail/cg_types.hpp

Lines changed: 76 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,10 @@ inline namespace _V1 {
3434
class interop_handle;
3535
class handler;
3636
namespace detail {
37+
// Prevent argument from being removed by the optimizer. Needed for different
38+
// host functions referencing kernel that we instantiate but don't intend to
39+
// call on host (e.g. to preserve symbols for the debugger).
40+
__SYCL_EXPORT bool do_not_dce(void (*)(void *));
3741
class HostTask;
3842

3943
/// Type of the command group.
@@ -163,6 +167,55 @@ class HostKernelBase {
163167
virtual void InstantiateKernelOnHost() = 0;
164168
};
165169

170+
template <class KernelType, class KernelArgType, int Dims>
171+
void InstantiateKernelOnHost(void *p) {
172+
auto &MKernel = *static_cast<KernelType *>(p);
173+
using IDBuilder = sycl::detail::Builder;
174+
if constexpr (std::is_same_v<KernelArgType, void>) {
175+
runKernelWithoutArg(MKernel);
176+
} else if constexpr (std::is_same_v<KernelArgType, sycl::id<Dims>>) {
177+
sycl::id ID = InitializedVal<Dims, id>::template get<0>();
178+
runKernelWithArg<const KernelArgType &>(MKernel, ID);
179+
} else if constexpr (std::is_same_v<KernelArgType, item<Dims, true>> ||
180+
std::is_same_v<KernelArgType, item<Dims, false>>) {
181+
constexpr bool HasOffset = std::is_same_v<KernelArgType, item<Dims, true>>;
182+
if constexpr (!HasOffset) {
183+
KernelArgType Item = IDBuilder::createItem<Dims, HasOffset>(
184+
InitializedVal<Dims, range>::template get<1>(),
185+
InitializedVal<Dims, id>::template get<0>());
186+
runKernelWithArg<KernelArgType>(MKernel, Item);
187+
} else {
188+
KernelArgType Item = IDBuilder::createItem<Dims, HasOffset>(
189+
InitializedVal<Dims, range>::template get<1>(),
190+
InitializedVal<Dims, id>::template get<0>(),
191+
InitializedVal<Dims, id>::template get<0>());
192+
runKernelWithArg<KernelArgType>(MKernel, Item);
193+
}
194+
} else if constexpr (std::is_same_v<KernelArgType, nd_item<Dims>>) {
195+
sycl::range<Dims> Range = InitializedVal<Dims, range>::template get<1>();
196+
sycl::id<Dims> ID = InitializedVal<Dims, id>::template get<0>();
197+
sycl::group<Dims> Group =
198+
IDBuilder::createGroup<Dims>(Range, Range, Range, ID);
199+
sycl::item<Dims, true> GlobalItem =
200+
IDBuilder::createItem<Dims, true>(Range, ID, ID);
201+
sycl::item<Dims, false> LocalItem =
202+
IDBuilder::createItem<Dims, false>(Range, ID);
203+
KernelArgType NDItem =
204+
IDBuilder::createNDItem<Dims>(GlobalItem, LocalItem, Group);
205+
runKernelWithArg<const KernelArgType>(MKernel, NDItem);
206+
} else if constexpr (std::is_same_v<KernelArgType, sycl::group<Dims>>) {
207+
sycl::range<Dims> Range = InitializedVal<Dims, range>::template get<1>();
208+
sycl::id<Dims> ID = InitializedVal<Dims, id>::template get<0>();
209+
KernelArgType Group = IDBuilder::createGroup<Dims>(Range, Range, Range, ID);
210+
runKernelWithArg<KernelArgType>(MKernel, Group);
211+
} else {
212+
// Assume that anything else can be default-constructed. If not, this
213+
// should fail to compile and the implementor should implement a generic
214+
// case for the new argument type.
215+
runKernelWithArg<KernelArgType>(MKernel, KernelArgType{});
216+
}
217+
}
218+
166219
// Class which stores specific lambda object.
167220
template <class KernelType, class KernelArgType, int Dims>
168221
class HostKernel : public HostKernelBase {
@@ -181,55 +234,32 @@ class HostKernel : public HostKernelBase {
181234
// kernel code instructions with source code lines.
182235
// NOTE: InstantiateKernelOnHost() should not be called.
183236
void InstantiateKernelOnHost() override {
184-
using IDBuilder = sycl::detail::Builder;
185-
if constexpr (std::is_same_v<KernelArgType, void>) {
186-
runKernelWithoutArg(MKernel);
187-
} else if constexpr (std::is_same_v<KernelArgType, sycl::id<Dims>>) {
188-
sycl::id ID = InitializedVal<Dims, id>::template get<0>();
189-
runKernelWithArg<const KernelArgType &>(MKernel, ID);
190-
} else if constexpr (std::is_same_v<KernelArgType, item<Dims, true>> ||
191-
std::is_same_v<KernelArgType, item<Dims, false>>) {
192-
constexpr bool HasOffset =
193-
std::is_same_v<KernelArgType, item<Dims, true>>;
194-
if constexpr (!HasOffset) {
195-
KernelArgType Item = IDBuilder::createItem<Dims, HasOffset>(
196-
InitializedVal<Dims, range>::template get<1>(),
197-
InitializedVal<Dims, id>::template get<0>());
198-
runKernelWithArg<KernelArgType>(MKernel, Item);
199-
} else {
200-
KernelArgType Item = IDBuilder::createItem<Dims, HasOffset>(
201-
InitializedVal<Dims, range>::template get<1>(),
202-
InitializedVal<Dims, id>::template get<0>(),
203-
InitializedVal<Dims, id>::template get<0>());
204-
runKernelWithArg<KernelArgType>(MKernel, Item);
205-
}
206-
} else if constexpr (std::is_same_v<KernelArgType, nd_item<Dims>>) {
207-
sycl::range<Dims> Range = InitializedVal<Dims, range>::template get<1>();
208-
sycl::id<Dims> ID = InitializedVal<Dims, id>::template get<0>();
209-
sycl::group<Dims> Group =
210-
IDBuilder::createGroup<Dims>(Range, Range, Range, ID);
211-
sycl::item<Dims, true> GlobalItem =
212-
IDBuilder::createItem<Dims, true>(Range, ID, ID);
213-
sycl::item<Dims, false> LocalItem =
214-
IDBuilder::createItem<Dims, false>(Range, ID);
215-
KernelArgType NDItem =
216-
IDBuilder::createNDItem<Dims>(GlobalItem, LocalItem, Group);
217-
runKernelWithArg<const KernelArgType>(MKernel, NDItem);
218-
} else if constexpr (std::is_same_v<KernelArgType, sycl::group<Dims>>) {
219-
sycl::range<Dims> Range = InitializedVal<Dims, range>::template get<1>();
220-
sycl::id<Dims> ID = InitializedVal<Dims, id>::template get<0>();
221-
KernelArgType Group =
222-
IDBuilder::createGroup<Dims>(Range, Range, Range, ID);
223-
runKernelWithArg<KernelArgType>(MKernel, Group);
224-
} else {
225-
// Assume that anything else can be default-constructed. If not, this
226-
// should fail to compile and the implementor should implement a generic
227-
// case for the new argument type.
228-
runKernelWithArg<KernelArgType>(MKernel, KernelArgType{});
229-
}
237+
detail::InstantiateKernelOnHost<KernelType, KernelArgType, Dims>(&MKernel);
230238
}
231239
};
232240

241+
class SimpleHostKernel : public HostKernelBase {
242+
std::unique_ptr<char[]> KernelBytes;
243+
244+
public:
245+
template <typename KernelType, typename KernelArgType, int Dims>
246+
SimpleHostKernel(const KernelType &KernelFunc, KernelArgType *, std::integral_constant<int, Dims>)
247+
: KernelBytes(new(
248+
std::align_val_t(alignof(KernelType))) char[sizeof(KernelType)]) {
249+
std::memcpy(KernelBytes.get(), &KernelFunc, sizeof(KernelType));
250+
// Hopefully, minimal run-time overhead:
251+
static thread_local auto ignore = do_not_dce(
252+
&detail::InstantiateKernelOnHost<KernelType, KernelArgType, Dims>);
253+
(void)ignore;
254+
}
255+
char *getPtr() override { return KernelBytes.get(); }
256+
~SimpleHostKernel() override = default;
257+
void InstantiateKernelOnHost() override {
258+
// We do this in the ctor instead.
259+
}
260+
};
261+
262+
233263
} // namespace detail
234264
} // namespace _V1
235265
} // namespace sycl

sycl/include/sycl/handler.hpp

Lines changed: 18 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -739,8 +739,24 @@ class __SYCL_EXPORT handler {
739739
// Not using `std::make_unique` to avoid unnecessary instantiations of
740740
// `std::unique_ptr<HostKernel<...>>`. Only
741741
// `std::unique_ptr<HostKernelBase>` is necessary.
742-
MHostKernel.reset(new detail::HostKernel<KernelType, LambdaArgType, Dims>(
743-
std::forward<KernelTypeUniversalRef>(KernelFunc)));
742+
743+
#if __has_builtin(__is_trivially_copyable) && \
744+
__has_builtin(__is_trivially_destructible)
745+
// libstdc++ implementation of std::is_trivially_[copyable|destructible]_v
746+
// has more compile time impact than the benefits of using a single
747+
// `SimpleHostKernel` vs. instantiating `HostKernel` for each
748+
// `KernelFunc`. As such, only do this optimization of the compilation time
749+
// when using `clang` as host compiler.
750+
if constexpr (__is_trivially_copyable(KernelType) &&
751+
__is_trivially_destructible(KernelType))
752+
MHostKernel.reset(new detail::SimpleHostKernel(
753+
std::forward<KernelTypeUniversalRef>(KernelFunc),
754+
static_cast<LambdaArgType *>(nullptr),
755+
std::integral_constant<int, Dims>{}));
756+
else
757+
#endif
758+
MHostKernel.reset(new detail::HostKernel<KernelType, LambdaArgType, Dims>(
759+
std::forward<KernelTypeUniversalRef>(KernelFunc)));
744760

745761
constexpr bool KernelHasName =
746762
detail::getKernelName<KernelName>() != nullptr &&

sycl/source/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -233,6 +233,7 @@ set(SYCL_COMMON_SOURCES
233233
"builtins/math_functions.cpp"
234234
"builtins/native_math_functions.cpp"
235235
"builtins/relational_functions.cpp"
236+
"cg_types.cpp"
236237
"detail/accessor_impl.cpp"
237238
"detail/allowlist.cpp"
238239
"detail/bindless_images.cpp"

sycl/source/cg_types.cpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
//==---- cg_types.cpp - Auxiliary types required by command group class ----==//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include <sycl/detail/cg_types.hpp>
10+
11+
namespace sycl {
12+
inline namespace _V1 {
13+
14+
namespace detail {
15+
// Exported no-op used by header code to keep a function pointer referenced.
// The pointer argument is deliberately unnamed and unused.
__SYCL_EXPORT bool do_not_dce(void (*)(void *)) {
  // The result carries no information; callers only need the call to exist.
  return true;
}
16+
} // namespace detail
17+
} // namespace _V1
18+
} // namespace sycl

0 commit comments

Comments
 (0)