Skip to content

Commit 2978123

Browse files
authored
Handler-less kernel submit path (parallel_for with nd_range) (#19294)
This PR introduces a fully handler-less kernel submission path. The feature is not complete yet. For testing purposes we introduce the __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT macros to enable unit tests for the new handler-less path. This macro should not be used by the application, and a legacy handler-based path is used. Once the handler-less path is fully implemented, we will switch corresponding APIs to use it unconditionally and will remove the macros. This PR covers: 1. A parallel_for, nd_range based kernel submit. 2. Parallel_for queue shortcut, enqueue_functions extension and KHR free functions extension. 3. A scheduler-based kernel submission. 4. A new unit test which covers the host task and kernel ordering for an in-order queue (including the handler-less path).
1 parent 0031df1 commit 2978123

File tree

13 files changed

+415
-20
lines changed

13 files changed

+415
-20
lines changed

sycl/cmake/modules/AddSYCLUnitTest.cmake

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
# Internal function to create SYCL unit tests with code reuse
2-
# add_sycl_unittest_internal(test_dirname SHARED|OBJECT is_preview file1.cpp, file2.cpp ...)
3-
function(add_sycl_unittest_internal test_dirname link_variant is_preview)
2+
# add_sycl_unittest_internal(test_dirname SHARED|OBJECT is_preview is_no_cgh file1.cpp, file2.cpp ...)
3+
function(add_sycl_unittest_internal test_dirname link_variant is_preview is_no_cgh)
44
# Enable exception handling for these unit tests
55
set(LLVM_REQUIRES_EH ON)
66
set(LLVM_REQUIRES_RTTI ON)
@@ -34,7 +34,11 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview)
3434
# Chaning CMAKE_CURRENT_BINARY_DIR should not affect this variable in its
3535
# parent scope.
3636
if (${is_preview})
37-
set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/Preview")
37+
set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/Preview")
38+
endif()
39+
40+
if (${is_no_cgh})
41+
set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/NoCGH")
3842
endif()
3943

4044
if ("${link_variant}" MATCHES "SHARED")
@@ -65,6 +69,18 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview)
6569
set(sycl_cache_suffix "_preview")
6670
endif()
6771

72+
if (${is_no_cgh})
73+
set(sycl_cache_suffix "_no_cgh")
74+
endif()
75+
76+
if (${is_no_cgh})
77+
target_compile_definitions(
78+
${test_dirname}
79+
PRIVATE
80+
__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
81+
)
82+
endif()
83+
6884
if (SYCL_ENABLE_XPTI_TRACING)
6985
target_compile_definitions(${test_dirname}
7086
PRIVATE XPTI_ENABLE_INSTRUMENTATION XPTI_STATIC_LIBRARY)
@@ -150,7 +166,6 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview)
150166
-Wno-inconsistent-missing-override
151167
)
152168
endif()
153-
154169
target_compile_definitions(${test_dirname} PRIVATE SYCL_DISABLE_FSYCL_SYCLHPP_WARNING)
155170
endfunction()
156171

@@ -160,6 +175,7 @@ endfunction()
160175
# the SYCL preview features enabled.
161176
# Produces two binaries, named `basename(test_name_prefix_non_preview)` and `basename(test_name_prefix_preview)`
162177
macro(add_sycl_unittest test_name_prefix link_variant)
163-
add_sycl_unittest_internal(${test_name_prefix}_non_preview ${link_variant} FALSE ${ARGN})
164-
add_sycl_unittest_internal(${test_name_prefix}_preview ${link_variant} TRUE ${ARGN})
178+
add_sycl_unittest_internal(${test_name_prefix}_non_preview ${link_variant} FALSE FALSE ${ARGN})
179+
add_sycl_unittest_internal(${test_name_prefix}_no_cgh ${link_variant} FALSE TRUE ${ARGN})
180+
add_sycl_unittest_internal(${test_name_prefix}_preview ${link_variant} TRUE FALSE ${ARGN})
165181
endmacro()

sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp

Lines changed: 30 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -259,10 +259,19 @@ template <typename KernelName = sycl::detail::auto_name, int Dimensions,
259259
typename KernelType, typename... ReductionsT>
260260
void nd_launch(queue Q, nd_range<Dimensions> Range, const KernelType &KernelObj,
261261
ReductionsT &&...Reductions) {
262-
submit(std::move(Q), [&](handler &CGH) {
263-
nd_launch<KernelName>(CGH, Range, KernelObj,
264-
std::forward<ReductionsT>(Reductions)...);
265-
});
262+
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
263+
// TODO The handler-less path does not support reductions yet.
264+
if constexpr (sizeof...(ReductionsT) == 0) {
265+
detail::submit_kernel_direct<KernelName>(std::move(Q), empty_properties_t{},
266+
Range, KernelObj);
267+
} else
268+
#endif
269+
{
270+
submit(std::move(Q), [&](handler &CGH) {
271+
nd_launch<KernelName>(CGH, Range, KernelObj,
272+
std::forward<ReductionsT>(Reductions)...);
273+
});
274+
}
266275
}
267276

268277
template <typename KernelName = sycl::detail::auto_name, int Dimensions,
@@ -283,10 +292,23 @@ template <typename KernelName = sycl::detail::auto_name, int Dimensions,
283292
typename Properties, typename KernelType, typename... ReductionsT>
284293
void nd_launch(queue Q, launch_config<nd_range<Dimensions>, Properties> Config,
285294
const KernelType &KernelObj, ReductionsT &&...Reductions) {
286-
submit(std::move(Q), [&](handler &CGH) {
287-
nd_launch<KernelName>(CGH, Config, KernelObj,
288-
std::forward<ReductionsT>(Reductions)...);
289-
});
295+
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
296+
// TODO The handler-less path does not support reductions yet.
297+
if constexpr (sizeof...(ReductionsT) == 0) {
298+
ext::oneapi::experimental::detail::LaunchConfigAccess<nd_range<Dimensions>,
299+
Properties>
300+
ConfigAccess(Config);
301+
detail::submit_kernel_direct<KernelName>(
302+
std::move(Q), ConfigAccess.getProperties(), ConfigAccess.getRange(),
303+
KernelObj);
304+
} else
305+
#endif
306+
{
307+
submit(std::move(Q), [&](handler &CGH) {
308+
nd_launch<KernelName>(CGH, Config, KernelObj,
309+
std::forward<ReductionsT>(Reductions)...);
310+
});
311+
}
290312
}
291313

292314
template <int Dimensions, typename... ArgsT>

sycl/include/sycl/khr/free_function_commands.hpp

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -153,27 +153,45 @@ void launch_grouped(const queue &q, range<1> r, range<1> size,
153153
const KernelType &k,
154154
const sycl::detail::code_location &codeLoc =
155155
sycl::detail::code_location::current()) {
156+
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
157+
detail::submit_kernel_direct(q,
158+
ext::oneapi::experimental::empty_properties_t{},
159+
nd_range<1>(r, size), k);
160+
#else
156161
submit(
157162
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
158163
codeLoc);
164+
#endif
159165
}
160166
template <typename KernelType>
161167
void launch_grouped(const queue &q, range<2> r, range<2> size,
162168
const KernelType &k,
163169
const sycl::detail::code_location &codeLoc =
164170
sycl::detail::code_location::current()) {
171+
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
172+
detail::submit_kernel_direct(q,
173+
ext::oneapi::experimental::empty_properties_t{},
174+
nd_range<2>(r, size), k);
175+
#else
165176
submit(
166177
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
167178
codeLoc);
179+
#endif
168180
}
169181
template <typename KernelType>
170182
void launch_grouped(const queue &q, range<3> r, range<3> size,
171183
const KernelType &k,
172184
const sycl::detail::code_location &codeLoc =
173185
sycl::detail::code_location::current()) {
186+
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
187+
detail::submit_kernel_direct(q,
188+
ext::oneapi::experimental::empty_properties_t{},
189+
nd_range<3>(r, size), k);
190+
#else
174191
submit(
175192
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
176193
codeLoc);
194+
#endif
177195
}
178196

179197
template <typename... Args>

sycl/include/sycl/queue.hpp

Lines changed: 74 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,20 @@ template <backend BackendName, class SyclObjectT>
6262
auto get_native(const SyclObjectT &Obj)
6363
-> backend_return_t<BackendName, SyclObjectT>;
6464

65+
template <int Dims>
66+
event __SYCL_EXPORT submit_kernel_direct_with_event_impl(
67+
const queue &Queue, const nd_range<Dims> &Range,
68+
std::shared_ptr<detail::HostKernelBase> &HostKernel,
69+
detail::DeviceKernelInfo *DeviceKernelInfo,
70+
const detail::code_location &CodeLoc, bool IsTopCodeLoc);
71+
72+
template <int Dims>
73+
void __SYCL_EXPORT submit_kernel_direct_without_event_impl(
74+
const queue &Queue, const nd_range<Dims> &Range,
75+
std::shared_ptr<detail::HostKernelBase> &HostKernel,
76+
detail::DeviceKernelInfo *DeviceKernelInfo,
77+
const detail::code_location &CodeLoc, bool IsTopCodeLoc);
78+
6579
namespace detail {
6680
class queue_impl;
6781

@@ -141,6 +155,51 @@ class __SYCL_EXPORT SubmissionInfo {
141155
};
142156

143157
} // namespace v1
158+
159+
template <typename KernelName = detail::auto_name, bool EventNeeded = false,
160+
typename PropertiesT, typename KernelType, int Dims>
161+
auto submit_kernel_direct(
162+
const queue &Queue, PropertiesT Props, const nd_range<Dims> &Range,
163+
const KernelType &KernelFunc,
164+
const detail::code_location &CodeLoc = detail::code_location::current()) {
165+
// TODO Properties not supported yet
166+
(void)Props;
167+
static_assert(
168+
std::is_same_v<PropertiesT,
169+
ext::oneapi::experimental::empty_properties_t>,
170+
"Setting properties not supported yet for no-CGH kernel submit.");
171+
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
172+
173+
using NameT =
174+
typename detail::get_kernel_name_t<KernelName, KernelType>::name;
175+
using LambdaArgType =
176+
sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
177+
static_assert(
178+
std::is_convertible_v<sycl::nd_item<Dims>, LambdaArgType>,
179+
"Kernel argument of a sycl::parallel_for with sycl::nd_range "
180+
"must be either sycl::nd_item or be convertible from sycl::nd_item");
181+
using TransformedArgType = sycl::nd_item<Dims>;
182+
183+
std::shared_ptr<detail::HostKernelBase> HostKernel = std::make_shared<
184+
detail::HostKernel<KernelType, TransformedArgType, Dims>>(KernelFunc);
185+
186+
detail::DeviceKernelInfo *DeviceKernelInfoPtr =
187+
&detail::getDeviceKernelInfo<NameT>();
188+
189+
detail::KernelWrapper<detail::WrapAs::parallel_for, NameT, KernelType,
190+
TransformedArgType, PropertiesT>::wrap(KernelFunc);
191+
192+
if constexpr (EventNeeded) {
193+
return submit_kernel_direct_with_event_impl(
194+
Queue, Range, HostKernel, DeviceKernelInfoPtr,
195+
TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel());
196+
} else {
197+
submit_kernel_direct_without_event_impl(
198+
Queue, Range, HostKernel, DeviceKernelInfoPtr,
199+
TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel());
200+
}
201+
}
202+
144203
} // namespace detail
145204

146205
namespace ext ::oneapi ::experimental {
@@ -3205,11 +3264,21 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
32053264
parallel_for(nd_range<Dims> Range, RestT &&...Rest) {
32063265
constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
32073266
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
3208-
return submit(
3209-
[&](handler &CGH) {
3210-
CGH.template parallel_for<KernelName>(Range, Rest...);
3211-
},
3212-
TlsCodeLocCapture.query());
3267+
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
3268+
// TODO The handler-less path does not support reductions yet.
3269+
if constexpr (sizeof...(RestT) == 1) {
3270+
return detail::submit_kernel_direct<KernelName, true>(
3271+
*this, ext::oneapi::experimental::empty_properties_t{}, Range,
3272+
Rest...);
3273+
} else
3274+
#endif
3275+
{
3276+
return submit(
3277+
[&](handler &CGH) {
3278+
CGH.template parallel_for<KernelName>(Range, Rest...);
3279+
},
3280+
TlsCodeLocCapture.query());
3281+
}
32133282
}
32143283

32153284
/// parallel_for version with a kernel represented as a lambda + nd_range that

sycl/source/detail/queue_impl.cpp

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -420,6 +420,106 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
420420
return EventImpl;
421421
}
422422

423+
detail::EventImplPtr queue_impl::submit_kernel_direct_impl(
424+
const NDRDescT &NDRDesc,
425+
std::shared_ptr<detail::HostKernelBase> &HostKernel,
426+
detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent,
427+
const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
428+
429+
KernelData KData;
430+
431+
KData.setDeviceKernelInfoPtr(DeviceKernelInfo);
432+
KData.setKernelFunc(HostKernel->getPtr());
433+
KData.setNDRDesc(NDRDesc);
434+
435+
auto SubmitKernelFunc =
436+
[&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr {
437+
std::unique_ptr<detail::CG> CommandGroup;
438+
std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
439+
std::vector<std::shared_ptr<const void>> AuxiliaryResources;
440+
441+
KData.extractArgsAndReqsFromLambda();
442+
443+
CommandGroup.reset(new detail::CGExecKernel(
444+
KData.getNDRDesc(), HostKernel,
445+
nullptr, // Kernel
446+
nullptr, // KernelBundle
447+
std::move(CGData), std::move(KData).getArgs(),
448+
*KData.getDeviceKernelInfoPtr(), std::move(StreamStorage),
449+
std::move(AuxiliaryResources), detail::CGType::Kernel,
450+
UR_KERNEL_CACHE_CONFIG_DEFAULT,
451+
false, // KernelIsCooperative
452+
false, // KernelUsesClusterLaunch
453+
0, // KernelWorkGroupMemorySize
454+
CodeLoc));
455+
CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
456+
457+
return detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
458+
*this, true);
459+
};
460+
461+
return submit_direct(CallerNeedsEvent, SubmitKernelFunc);
462+
}
463+
464+
template <typename SubmitCommandFuncType>
465+
detail::EventImplPtr
466+
queue_impl::submit_direct(bool CallerNeedsEvent,
467+
SubmitCommandFuncType &SubmitCommandFunc) {
468+
detail::CG::StorageInitHelper CGData;
469+
std::unique_lock<std::mutex> Lock(MMutex);
470+
471+
// Graphs are not supported yet for the no-handler path
472+
assert(!hasCommandGraph());
473+
474+
// Set the No Last Event Mode to false, since the no-handler path
475+
// does not support it yet.
476+
MNoLastEventMode.store(false, std::memory_order_relaxed);
477+
478+
// Used by queue_empty() and getLastEvent()
479+
MEmpty.store(false, std::memory_order_release);
480+
481+
// Sync with an external event
482+
std::optional<event> ExternalEvent = popExternalEvent();
483+
if (ExternalEvent) {
484+
CGData.MEvents.push_back(getSyclObjImpl(*ExternalEvent));
485+
}
486+
487+
// Sync with the last event for in order queue
488+
EventImplPtr &LastEvent = MDefaultGraphDeps.LastEventPtr;
489+
if (isInOrder() && LastEvent) {
490+
CGData.MEvents.push_back(LastEvent);
491+
}
492+
493+
// Barrier and un-enqueued commands synchronization for out or order queue
494+
if (!isInOrder()) {
495+
MMissedCleanupRequests.unset(
496+
[&](MissedCleanupRequestsType &MissedCleanupRequests) {
497+
for (auto &UpdatedGraph : MissedCleanupRequests)
498+
doUnenqueuedCommandCleanup(UpdatedGraph);
499+
MissedCleanupRequests.clear();
500+
});
501+
502+
if (MDefaultGraphDeps.LastBarrier &&
503+
!MDefaultGraphDeps.LastBarrier->isEnqueued()) {
504+
CGData.MEvents.push_back(MDefaultGraphDeps.LastBarrier);
505+
}
506+
}
507+
508+
EventImplPtr EventImpl = SubmitCommandFunc(CGData);
509+
510+
// Sync with the last event for in order queue
511+
if (isInOrder() && !EventImpl->isDiscarded()) {
512+
LastEvent = EventImpl;
513+
}
514+
515+
// Barrier and un-enqueued commands synchronization for out or order queue
516+
if (!isInOrder() && !EventImpl->isEnqueued()) {
517+
MDefaultGraphDeps.UnenqueuedCmdEvents.push_back(EventImpl);
518+
}
519+
520+
return CallerNeedsEvent ? EventImpl : nullptr;
521+
}
522+
423523
template <typename HandlerFuncT>
424524
event queue_impl::submitWithHandler(const std::vector<event> &DepEvents,
425525
bool CallerNeedsEvent,

0 commit comments

Comments
 (0)