Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
1d1bf16
[SYCL] Add scheduler-bypass for handler-less kernel submission path
slawekptak Sep 29, 2025
91ad6dd
Remove unnecessary EventImpl check
slawekptak Sep 29, 2025
4d06579
Extract the scheduler bypass logic into a separate function
slawekptak Sep 30, 2025
03deefe
Change the EventImpl var name back to original
slawekptak Sep 30, 2025
956d27a
Address review comments
slawekptak Oct 1, 2025
f0c9da5
Merge branch 'sycl' into no_handler_scheduler_bypass
slawekptak Oct 1, 2025
795375a
Merge branch 'sycl' into no_handler_scheduler_bypass
slawekptak Oct 3, 2025
f15f842
Allocate HostKernel on the scheduler path only
slawekptak Oct 3, 2025
e8dc229
Fix formatting
slawekptak Oct 3, 2025
bcf270f
Address review comments
slawekptak Oct 3, 2025
43b4b3a
Change the LaunchGroupedShortcutMoveKernelNoEvent unit test,
slawekptak Oct 3, 2025
2f0280d
[SYCL] Fallback path for handler-less kernel properties
slawekptak Oct 6, 2025
4832419
Add properties check to free function extension
slawekptak Oct 6, 2025
798c1ca
Merge branch 'sycl' into no_handler_scheduler_bypass
slawekptak Oct 6, 2025
2bd29d0
[SYCL] Remove assertion for graph support for handler-less kernel submit
slawekptak Oct 6, 2025
36fa311
Use scheduler bypass path only if no graph associated with the queue
slawekptak Oct 6, 2025
6500bf0
Merge branch 'no_handler_properties_fallback' into temp_no_handler_in…
slawekptak Oct 7, 2025
2ed4cb7
Merge branch 'no_handler_graph_record_followup' into temp_no_handler_…
slawekptak Oct 7, 2025
3b867c2
Temp - Remove the no-handler macro
slawekptak Oct 7, 2025
fa18fcd
Comment out the event check
slawekptak Oct 7, 2025
f464e17
Change the expected copy_count in test_num_kernel_copies
slawekptak Oct 7, 2025
40c39d6
Basic no-handler single task support
slawekptak Oct 7, 2025
c0345df
A fix for no last event mode
slawekptak Oct 7, 2025
1e77cae
Merge branch 'no_handler_single_task' into temp_no_handler_integratio…
slawekptak Oct 7, 2025
c06f56f
Merge branch 'sycl' into temp_no_handler_integration_v2
slawekptak Oct 7, 2025
f3c0959
Fix unused var
slawekptak Oct 7, 2025
9564695
Properties fallback for single_task
slawekptak Oct 8, 2025
549ae54
Update the expected kernel copies for single_task
slawekptak Oct 8, 2025
dbe0fb9
Update the kernel size test, add properties check for single_task
slawekptak Oct 8, 2025
963df1a
A fallback mechanism for kernels with kernel_handler
slawekptak Oct 8, 2025
38b9fe9
Remove the cg_types check
slawekptak Oct 8, 2025
48e7231
Temporary change - remove the single_task error message test
slawekptak Oct 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 18 additions & 9 deletions sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,19 @@ template <typename KernelName = sycl::detail::auto_name, typename KernelType>
void single_task(queue Q, const KernelType &KernelObj,
const sycl::detail::code_location &CodeLoc =
sycl::detail::code_location::current()) {
submit(
std::move(Q),
[&](handler &CGH) { single_task<KernelName>(CGH, KernelObj); }, CodeLoc);
if constexpr (!(ext::oneapi::experimental::detail::
HasKernelPropertiesGetMethod<
const KernelType &>::value) &&
!(detail::KernelLambdaHasKernelHandlerArgT<KernelType,
void>::value)) {
detail::submit_kernel_direct_single_task<KernelName>(
std::move(Q), empty_properties_t{}, KernelObj, CodeLoc);
} else {
submit(
std::move(Q),
[&](handler &CGH) { single_task<KernelName>(CGH, KernelObj); },
CodeLoc);
}
}

template <typename... ArgsT>
Expand Down Expand Up @@ -259,18 +269,17 @@ template <typename KernelName = sycl::detail::auto_name, int Dimensions,
typename KernelType, typename... ReductionsT>
void nd_launch(queue Q, nd_range<Dimensions> Range, const KernelType &KernelObj,
ReductionsT &&...Reductions) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
// TODO The handler-less path does not support reductions and kernel function
// properties yet.
if constexpr (sizeof...(ReductionsT) == 0 &&
!(ext::oneapi::experimental::detail::
HasKernelPropertiesGetMethod<
const KernelType &>::value)) {
detail::submit_kernel_direct<KernelName>(std::move(Q), empty_properties_t{},
const KernelType &>::value) &&
!(detail::KernelLambdaHasKernelHandlerArgT<
KernelType, sycl::nd_item<Dimensions>>::value)) {
detail::submit_kernel_direct_parallel_for<KernelName>(std::move(Q), empty_properties_t{},
Range, KernelObj);
} else
#endif
{
} else {
submit(std::move(Q), [&](handler &CGH) {
nd_launch<KernelName>(CGH, Range, KernelObj,
std::forward<ReductionsT>(Reductions)...);
Expand Down
45 changes: 26 additions & 19 deletions sycl/include/sycl/khr/free_function_commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,17 +157,16 @@ template <typename KernelType, typename = typename std::enable_if_t<
void launch_grouped(const queue &q, range<1> r, range<1> size, KernelType &&k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
// TODO The handler-less path does not support kernel function properties yet.
if constexpr (!(ext::oneapi::experimental::detail::
HasKernelPropertiesGetMethod<
const KernelType &>::value)) {
detail::submit_kernel_direct(
const KernelType &>::value) &&
!(detail::KernelLambdaHasKernelHandlerArgT<
KernelType, sycl::nd_item<1>>::value)) {
detail::submit_kernel_direct_parallel_for(
q, ext::oneapi::experimental::empty_properties_t{},
nd_range<1>(r, size), std::forward<KernelType>(k));
} else
#endif
{
} else {
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
Expand All @@ -178,17 +177,16 @@ template <typename KernelType, typename = typename std::enable_if_t<
void launch_grouped(const queue &q, range<2> r, range<2> size, KernelType &&k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
// TODO The handler-less path does not support kernel function properties yet.
if constexpr (!(ext::oneapi::experimental::detail::
HasKernelPropertiesGetMethod<
const KernelType &>::value)) {
detail::submit_kernel_direct(
const KernelType &>::value) &&
!(detail::KernelLambdaHasKernelHandlerArgT<
KernelType, sycl::nd_item<2>>::value)) {
detail::submit_kernel_direct_parallel_for(
q, ext::oneapi::experimental::empty_properties_t{},
nd_range<2>(r, size), std::forward<KernelType>(k));
} else
#endif
{
} else {
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
Expand All @@ -199,17 +197,16 @@ template <typename KernelType, typename = typename std::enable_if_t<
void launch_grouped(const queue &q, range<3> r, range<3> size, KernelType &&k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
// TODO The handler-less path does not support kernel function properties yet.
if constexpr (!(ext::oneapi::experimental::detail::
HasKernelPropertiesGetMethod<
const KernelType &>::value)) {
detail::submit_kernel_direct(
const KernelType &>::value) &&
!(detail::KernelLambdaHasKernelHandlerArgT<
KernelType, sycl::nd_item<3>>::value)) {
detail::submit_kernel_direct_parallel_for(
q, ext::oneapi::experimental::empty_properties_t{},
nd_range<3>(r, size), std::forward<KernelType>(k));
} else
#endif
{
} else {
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
Expand Down Expand Up @@ -323,7 +320,17 @@ template <typename KernelType>
void launch_task(const sycl::queue &q, const KernelType &k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
submit(q, [&](handler &h) { launch_task<KernelType>(h, k); }, codeLoc);
if constexpr (!(ext::oneapi::experimental::detail::
HasKernelPropertiesGetMethod<
const KernelType &>::value) &&
!(detail::KernelLambdaHasKernelHandlerArgT<KernelType,
void>::value)) {
detail::submit_kernel_direct_single_task(
q, ext::oneapi::experimental::empty_properties_t{}, k, codeLoc);
} else {
submit(
q, [&](handler &h) { launch_task<KernelType>(h, k); }, codeLoc);
}
}

template <typename... Args>
Expand Down
118 changes: 104 additions & 14 deletions sycl/include/sycl/queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ class __SYCL_EXPORT SubmissionInfo {

template <typename KernelName = detail::auto_name, bool EventNeeded = false,
typename PropertiesT, typename KernelTypeUniversalRef, int Dims>
auto submit_kernel_direct(
auto submit_kernel_direct_parallel_for(
const queue &Queue, PropertiesT Props, const nd_range<Dims> &Range,
KernelTypeUniversalRef &&KernelFunc,
const detail::code_location &CodeLoc = detail::code_location::current()) {
Expand Down Expand Up @@ -199,6 +199,22 @@ auto submit_kernel_direct(

detail::DeviceKernelInfo *DeviceKernelInfoPtr =
&detail::getDeviceKernelInfo<NameT>();
constexpr auto Info = detail::CompileTimeKernelInfo<NameT>;

assert(Info.Name != std::string_view{} && "Kernel must have a name!");

static_assert(
Info.Name == std::string_view{} || sizeof(KernelType) == Info.KernelSize,
"Unexpected kernel lambda size. This can be caused by an "
"external host compiler producing a lambda with an "
"unexpected layout. This is a limitation of the compiler."
"In many cases the difference is related to capturing constexpr "
"variables. In such cases removing constexpr specifier aligns the "
"captures between the host compiler and the device compiler."
"\n"
"In case of MSVC, passing "
"-fsycl-host-compiler-options='/std:c++latest' "
"might also help.");

if constexpr (EventNeeded) {
return submit_kernel_direct_with_event_impl(
Expand All @@ -211,6 +227,69 @@ auto submit_kernel_direct(
}
}

template <typename KernelName = detail::auto_name, bool EventNeeded = false,
typename PropertiesT, typename KernelTypeUniversalRef>
auto submit_kernel_direct_single_task(
const queue &Queue, PropertiesT Props, KernelTypeUniversalRef &&KernelFunc,
const detail::code_location &CodeLoc = detail::code_location::current()) {
// TODO Properties not supported yet
(void)Props;
static_assert(
std::is_same_v<PropertiesT,
ext::oneapi::experimental::empty_properties_t>,
"Setting properties not supported yet for no-CGH kernel submit.");
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);

using KernelType =
std::remove_const_t<std::remove_reference_t<KernelTypeUniversalRef>>;

using NameT =
typename detail::get_kernel_name_t<KernelName, KernelType>::name;

detail::KernelWrapper<detail::WrapAs::single_task, NameT, KernelType,
void, PropertiesT>::wrap(KernelFunc);

HostKernelRef<KernelType, KernelTypeUniversalRef, void, 1>
HostKernel(std::forward<KernelTypeUniversalRef>(KernelFunc));

// Instantiating the kernel on the host improves debugging.
// Passing this pointer to another translation unit prevents optimization.
#ifndef NDEBUG
// TODO: call library to prevent dropping call due to optimization
(void)
detail::GetInstantiateKernelOnHostPtr<KernelType, void, 1>();
#endif

detail::DeviceKernelInfo *DeviceKernelInfoPtr =
&detail::getDeviceKernelInfo<NameT>();
constexpr auto Info = detail::CompileTimeKernelInfo<NameT>;

assert(Info.Name != std::string_view{} && "Kernel must have a name!");

static_assert(
Info.Name == std::string_view{} || sizeof(KernelType) == Info.KernelSize,
"Unexpected kernel lambda size. This can be caused by an "
"external host compiler producing a lambda with an "
"unexpected layout. This is a limitation of the compiler."
"In many cases the difference is related to capturing constexpr "
"variables. In such cases removing constexpr specifier aligns the "
"captures between the host compiler and the device compiler."
"\n"
"In case of MSVC, passing "
"-fsycl-host-compiler-options='/std:c++latest' "
"might also help.");

if constexpr (EventNeeded) {
return submit_kernel_direct_with_event_impl(
Queue, nd_range<1>{1, 1}, HostKernel, DeviceKernelInfoPtr,
TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel());
} else {
submit_kernel_direct_without_event_impl(
Queue, nd_range<1>{1, 1}, HostKernel, DeviceKernelInfoPtr,
TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel());
}
}

} // namespace detail

namespace ext ::oneapi ::experimental {
Expand Down Expand Up @@ -2721,13 +2800,25 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
"sycl::queue.single_task() requires a kernel instead of command group. "
"Use queue.submit() instead");

detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
return submit(
[&](handler &CGH) {
CGH.template single_task<KernelName, KernelType, PropertiesT>(
Properties, KernelFunc);
},
TlsCodeLocCapture.query());
if constexpr (
std::is_same_v<PropertiesT,
ext::oneapi::experimental::empty_properties_t> &&
!(ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod<
const KernelType &>::value) &&
!(detail::KernelLambdaHasKernelHandlerArgT<KernelType, void>::value)) {
(void)Properties;
return detail::submit_kernel_direct_single_task<KernelName, true>(
*this, ext::oneapi::experimental::empty_properties_t{},
KernelFunc, CodeLoc);
} else {
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
return submit(
[&](handler &CGH) {
CGH.template single_task<KernelName, KernelType, PropertiesT>(
Properties, KernelFunc);
},
TlsCodeLocCapture.query());
}
}

/// single_task version with a kernel represented as a lambda.
Expand Down Expand Up @@ -3275,21 +3366,20 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
parallel_for(nd_range<Dims> Range, RestT &&...Rest) {
constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
using KernelType = std::tuple_element_t<0, std::tuple<RestT...>>;

// TODO The handler-less path does not support reductions and kernel
// function properties yet.
if constexpr (sizeof...(RestT) == 1 &&
!(ext::oneapi::experimental::detail::
HasKernelPropertiesGetMethod<
const KernelType &>::value)) {
return detail::submit_kernel_direct<KernelName, true>(
const KernelType &>::value) &&
!(detail::KernelLambdaHasKernelHandlerArgT<
KernelType, sycl::nd_item<Dims>>::value)) {
return detail::submit_kernel_direct_parallel_for<KernelName, true>(
*this, ext::oneapi::experimental::empty_properties_t{}, Range,
Rest...);
} else
#endif
{
} else {
return submit(
[&](handler &CGH) {
CGH.template parallel_for<KernelName>(Range, Rest...);
Expand Down
8 changes: 8 additions & 0 deletions sycl/source/detail/queue_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -662,6 +662,14 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
: true) &&
!hasCommandGraph();

if (isInOrder()) {
if (SchedulerBypass) {
MNoLastEventMode.store(true, std::memory_order_relaxed);
} else {
MNoLastEventMode.store(false, std::memory_order_relaxed);
}
Comment on lines +668 to +670
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The else block is not needed because the MNoLastEventMode is already false. It was assigned on the L626

}

EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass);

// Sync with the last event for in order queue. For scheduler-bypass flow,
Expand Down
4 changes: 2 additions & 2 deletions sycl/test-e2e/Basic/test_num_kernel_copies.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -29,13 +29,13 @@ int main(int argc, char **argv) {

kernel<1> krn1;
q.parallel_for(sycl::nd_range<1>{1, 1}, krn1);
assert(copy_count == 1);
assert(copy_count == 0);
assert(move_count == 0);
copy_count = 0;

kernel<2> krn2;
q.single_task(krn2);
assert(copy_count == 1);
assert(copy_count == 0);
assert(move_count == 0);
copy_count = 0;

Expand Down
2 changes: 1 addition & 1 deletion sycl/test/basic_tests/kernel_size_mismatch.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ int main() {
(void)A;
// expected-no-diagnostics
#else
// expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement '{{.*}}': Unexpected kernel lambda size. This can be caused by an external host compiler producing a lambda with an unexpected layout. This is a limitation of the compiler.}}
// expected-error-re@sycl/queue.hpp:* {{static assertion failed due to requirement '{{.*}}': Unexpected kernel lambda size. This can be caused by an external host compiler producing a lambda with an unexpected layout. This is a limitation of the compiler.}}
#endif
}).wait();
}
14 changes: 0 additions & 14 deletions sycl/test/basic_tests/single_task_error_message.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,20 +2,6 @@
#include <iostream>
#include <sycl/sycl.hpp>
int main() {
{
int varA = 42;
int varB = 42;
int sum = 0;
sycl::queue myQueue{};
{
myQueue
.single_task([&](sycl::handler &cgh) {
// expected-error-re@sycl/queue.hpp:* {{static assertion failed due to requirement '{{.*}}': sycl::queue.single_task() requires a kernel instead of command group.{{.*}} Use queue.submit() instead}}
// expected-error-re@sycl/detail/cg_types.hpp:* {{no matching function for call to object of type '(lambda at {{.*}}single_task_error_message.cpp:{{.*}})'}}
})
.wait();
}
}
{
int varA = 42;
int varB = 42;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,6 @@ TEST_F(FreeFunctionCommandsEventsTests, LaunchGroupedShortcutNoEvent) {
ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1});
}

#if __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
TEST_F(FreeFunctionCommandsEventsTests,
LaunchGroupedShortcutMoveKernelNoEvent) {
mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
Expand Down Expand Up @@ -276,7 +275,6 @@ TEST_F(FreeFunctionCommandsEventsTests,
ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 1);
ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{2});
}
#endif

TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchGroupedKernelNoEvent) {
mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch",
Expand Down
Loading
Loading