Skip to content
Merged
Show file tree
Hide file tree
Changes from 58 commits
Commits
Show all changes
74 commits
Select commit Hold shift + click to select a range
3223842
[SYCL] Handler-less kernel submit API
slawekptak Jul 3, 2025
fde19ca
Fix formatting
slawekptak Jul 3, 2025
13424de
Fix formatting
slawekptak Jul 4, 2025
fbc789d
Change the ExtendedSubmissionInfo to KernelRuntimeInfo,
slawekptak Jul 7, 2025
591b3ec
Added copy/move constructor and assignment operator
slawekptak Jul 8, 2025
d235b7c
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Jul 8, 2025
6641601
Add a no event submit and no handler compile flag
slawekptak Jul 11, 2025
0f41d5a
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Jul 14, 2025
a6e711e
Added a new configure option to build no handler submit path, changed
slawekptak Jul 14, 2025
9c8040e
Host task dependency test
slawekptak Jul 17, 2025
31cbdb9
Add a check for special captures
slawekptak Jul 18, 2025
c5cd091
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Aug 11, 2025
998d592
Switch to the common kernel wrappers, fix the KRInfo function call
slawekptak Aug 11, 2025
4000c07
Enable no handler in the preview lib build, add no handler unit
slawekptak Aug 12, 2025
f8e9cd6
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Aug 12, 2025
01af8bb
Unused argument fix and IsTopCodeLoc assignment
slawekptak Aug 12, 2025
4469e59
Implemented the barrier and un-enqueued commands synchronization
slawekptak Aug 13, 2025
ac1a5cf
Fix formatting
slawekptak Aug 13, 2025
5865f3a
Fixed #ifdef, added comment to a new function.
slawekptak Aug 13, 2025
072803c
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Aug 19, 2025
27b3110
Address review comments
slawekptak Aug 20, 2025
9041e94
Updated Linux symbols
slawekptak Aug 21, 2025
ac2c5bb
Addressed more review comments
slawekptak Aug 21, 2025
8e155fb
Fix formatting
slawekptak Aug 21, 2025
502f637
Fix formatting, remove unused properties argument
slawekptak Aug 21, 2025
d708c93
Fix ProcessKernelRuntimeInfo call
slawekptak Aug 21, 2025
e9f6e4e
Fix unit test build and ProcessKernelRuntimeInfo calls
slawekptak Aug 21, 2025
057a7a5
Fix formatting
slawekptak Aug 21, 2025
77d92ca
Added single_task shortcut function support for no-handler
slawekptak Aug 22, 2025
85aaa5c
Fix formatting
slawekptak Aug 25, 2025
a54422a
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Aug 29, 2025
967d35e
Update KernelRuntimeInfo - change kernel name based cache pointer
slawekptak Aug 29, 2025
ec1ef89
Switch to DeviceKernelInfo use on the no-handler path
slawekptak Aug 29, 2025
1f95b9b
KernelName fix
slawekptak Aug 29, 2025
12ef6da
Update Windows symbols
slawekptak Aug 29, 2025
2980531
Split the kernel submit code into a command submission lambda and gen…
slawekptak Sep 1, 2025
01e0f9f
Fix formatting
slawekptak Sep 2, 2025
63d1345
Rename submit_generic_direct to submit_direct
slawekptak Sep 2, 2025
4001fea
Fix unused Props argument
slawekptak Sep 4, 2025
6c9525b
Update Linux symbols
slawekptak Sep 4, 2025
f871b10
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Sep 16, 2025
18df56b
Define the SubmitCommandFuncType template type and rebase
slawekptak Sep 16, 2025
3375e77
Use the KernelData structure in the no-handler path
slawekptak Sep 17, 2025
72dc199
Rename KernelRuntimeInfo to KernelDataDesc
slawekptak Sep 17, 2025
9715916
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Sep 17, 2025
177277b
Minor changes in the template variables
slawekptak Sep 17, 2025
eb9a5d6
Remove unused type
slawekptak Sep 17, 2025
1f8ea92
Remove KernelDataDesc and pass the arguments directly,
slawekptak Sep 22, 2025
74438ae
Merge branch 'sycl' into no_handler_lib_entry
slawekptak Sep 22, 2025
0e48eb6
Code cleanup
slawekptak Sep 22, 2025
0d47ac7
Update Linux symbols
slawekptak Sep 22, 2025
ab6812a
Update Windows symbols
slawekptak Sep 22, 2025
a35286d
Address review comments
slawekptak Sep 22, 2025
27a5cf5
Rvalue reference for KernelData
slawekptak Sep 22, 2025
9144f84
Add a static_assert to check if properties are empty.
slawekptak Sep 23, 2025
ca0b632
Remove unused function
slawekptak Sep 23, 2025
42e2b30
Fix unused parameter
slawekptak Sep 23, 2025
943f1f7
Merge two overloads of submit_kernel_direct_impl
slawekptak Sep 23, 2025
76bcaf2
Template instantiations for submit_kernel_direct_with_event_impl
slawekptak Sep 24, 2025
6588fe8
Change kernel direct submit functions in queue_impl to templates
slawekptak Sep 24, 2025
3c0e33c
Update Linux symbols
slawekptak Sep 24, 2025
8a20b8a
Update Windows symbols
slawekptak Sep 24, 2025
2be3d3d
Convert the kernel direct submit functions to free functions
slawekptak Sep 25, 2025
f139c93
Minor fixes
slawekptak Sep 25, 2025
8023ec1
Add missing calls.
slawekptak Sep 25, 2025
de94db6
Remove extern template definitions
slawekptak Sep 26, 2025
066b421
Consolidate the event-based and event-less functions
slawekptak Sep 26, 2025
eed0591
Make free functions from the queue kernel direct submit methods
slawekptak Sep 26, 2025
552f448
Address review comments
slawekptak Sep 26, 2025
f5c0d77
Update Linux symbols
slawekptak Sep 26, 2025
b442d37
ifdef fix
slawekptak Sep 26, 2025
5fa8ccc
Removed unused function declaration
slawekptak Sep 26, 2025
fa6d2f8
Export template instantiations
slawekptak Sep 26, 2025
f04ed3f
Update Windows symbols
slawekptak Sep 26, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 22 additions & 6 deletions sycl/cmake/modules/AddSYCLUnitTest.cmake
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Internal function to create SYCL unit tests with code reuse
# add_sycl_unittest_internal(test_dirname SHARED|OBJECT is_preview file1.cpp, file2.cpp ...)
function(add_sycl_unittest_internal test_dirname link_variant is_preview)
# add_sycl_unittest_internal(test_dirname SHARED|OBJECT is_preview is_no_cgh file1.cpp, file2.cpp ...)
function(add_sycl_unittest_internal test_dirname link_variant is_preview is_no_cgh)
# Enable exception handling for these unit tests
set(LLVM_REQUIRES_EH ON)
set(LLVM_REQUIRES_RTTI ON)
Expand Down Expand Up @@ -34,7 +34,11 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview)
# Chaning CMAKE_CURRENT_BINARY_DIR should not affect this variable in its
# parent scope.
if (${is_preview})
set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/Preview")
set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/Preview")
endif()

if (${is_no_cgh})
set(CMAKE_CURRENT_BINARY_DIR "${CMAKE_CURRENT_BINARY_DIR}/NoCGH")
endif()

if ("${link_variant}" MATCHES "SHARED")
Expand Down Expand Up @@ -65,6 +69,18 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview)
set(sycl_cache_suffix "_preview")
endif()

if (${is_no_cgh})
set(sycl_cache_suffix "_no_cgh")
endif()

if (${is_no_cgh})
target_compile_definitions(
${test_dirname}
PRIVATE
__DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
)
endif()

if (SYCL_ENABLE_XPTI_TRACING)
target_compile_definitions(${test_dirname}
PRIVATE XPTI_ENABLE_INSTRUMENTATION XPTI_STATIC_LIBRARY)
Expand Down Expand Up @@ -150,7 +166,6 @@ function(add_sycl_unittest_internal test_dirname link_variant is_preview)
-Wno-inconsistent-missing-override
)
endif()

target_compile_definitions(${test_dirname} PRIVATE SYCL_DISABLE_FSYCL_SYCLHPP_WARNING)
endfunction()

Expand All @@ -160,6 +175,7 @@ endfunction()
# the SYCL preview features enabled.
# Produces two binaries, named `basename(test_name_prefix_non_preview)` and `basename(test_name_prefix_preview)`
macro(add_sycl_unittest test_name_prefix link_variant)
add_sycl_unittest_internal(${test_name_prefix}_non_preview ${link_variant} FALSE ${ARGN})
add_sycl_unittest_internal(${test_name_prefix}_preview ${link_variant} TRUE ${ARGN})
add_sycl_unittest_internal(${test_name_prefix}_non_preview ${link_variant} FALSE FALSE ${ARGN})
add_sycl_unittest_internal(${test_name_prefix}_no_cgh ${link_variant} FALSE TRUE ${ARGN})
add_sycl_unittest_internal(${test_name_prefix}_preview ${link_variant} TRUE FALSE ${ARGN})
endmacro()
58 changes: 50 additions & 8 deletions sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,17 @@ event submit_with_event_impl(const queue &Q, PropertiesT Props,
const sycl::detail::code_location &CodeLoc) {
return Q.submit_with_event(Props, detail::type_erased_cgfo_ty{CGF}, CodeLoc);
}

template <typename KernelName, typename PropertiesT, typename KernelType,
int Dims>
void submit_kernel_direct_impl(const queue &Q, PropertiesT Props,
nd_range<Dims> Range,
const KernelType &KernelFunc,
const sycl::detail::code_location &CodeLoc) {
Q.submit_kernel_direct_without_event<KernelName, PropertiesT, KernelType,
Dims>(Props, Range, KernelFunc, CodeLoc);
}

} // namespace detail

template <typename CommandGroupFunc, typename PropertiesT>
Expand All @@ -126,6 +137,17 @@ void submit(const queue &Q, CommandGroupFunc &&CGF,
submit(Q, empty_properties_t{}, std::forward<CommandGroupFunc>(CGF), CodeLoc);
}

template <typename KernelName = sycl::detail::auto_name, typename PropertiesT,
typename KernelType, int Dims>
void submit(const queue &Q, PropertiesT Props, nd_range<Dims> Range,
const KernelType &KernelFunc,
const sycl::detail::code_location &CodeLoc =
sycl::detail::code_location::current()) {
sycl::ext::oneapi::experimental::detail::submit_kernel_direct_impl<
KernelName, PropertiesT, KernelType, Dims>(Q, Props, Range, KernelFunc,
CodeLoc);
}

template <typename CommandGroupFunc, typename PropertiesT>
event submit_with_event(const queue &Q, PropertiesT Props,
CommandGroupFunc &&CGF,
Expand Down Expand Up @@ -259,10 +281,18 @@ template <typename KernelName = sycl::detail::auto_name, int Dimensions,
typename KernelType, typename... ReductionsT>
void nd_launch(queue Q, nd_range<Dimensions> Range, const KernelType &KernelObj,
ReductionsT &&...Reductions) {
submit(std::move(Q), [&](handler &CGH) {
nd_launch<KernelName>(CGH, Range, KernelObj,
std::forward<ReductionsT>(Reductions)...);
});
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
if constexpr (sizeof...(ReductionsT) == 0) {
submit<KernelName>(std::move(Q), empty_properties_t{}, Range, KernelObj);
} else {
#endif
submit(std::move(Q), [&](handler &CGH) {
nd_launch<KernelName>(CGH, Range, KernelObj,
std::forward<ReductionsT>(Reductions)...);
});
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
}
#endif
}

template <typename KernelName = sycl::detail::auto_name, int Dimensions,
Expand All @@ -283,10 +313,22 @@ template <typename KernelName = sycl::detail::auto_name, int Dimensions,
typename Properties, typename KernelType, typename... ReductionsT>
void nd_launch(queue Q, launch_config<nd_range<Dimensions>, Properties> Config,
const KernelType &KernelObj, ReductionsT &&...Reductions) {
submit(std::move(Q), [&](handler &CGH) {
nd_launch<KernelName>(CGH, Config, KernelObj,
std::forward<ReductionsT>(Reductions)...);
});
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
if constexpr (sizeof...(ReductionsT) == 0) {
ext::oneapi::experimental::detail::LaunchConfigAccess<nd_range<Dimensions>,
Properties>
ConfigAccess(Config);
submit<KernelName>(std::move(Q), ConfigAccess.getProperties(),
ConfigAccess.getRange(), KernelObj);
} else {
#endif
submit(std::move(Q), [&](handler &CGH) {
nd_launch<KernelName>(CGH, Config, KernelObj,
std::forward<ReductionsT>(Reductions)...);
});
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
}
#endif
}

template <int Dimensions, typename... ArgsT>
Expand Down
15 changes: 15 additions & 0 deletions sycl/include/sycl/khr/free_function_commands.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -153,27 +153,42 @@ void launch_grouped(const queue &q, range<1> r, range<1> size,
const KernelType &k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
submit(q, ext::oneapi::experimental::empty_properties_t{},
nd_range<1>(r, size), k);
#else
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
#endif
}
template <typename KernelType>
void launch_grouped(const queue &q, range<2> r, range<2> size,
const KernelType &k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
submit(q, ext::oneapi::experimental::empty_properties_t{},
nd_range<2>(r, size), k);
#else
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
#endif
}
template <typename KernelType>
void launch_grouped(const queue &q, range<3> r, range<3> size,
const KernelType &k,
const sycl::detail::code_location &codeLoc =
sycl::detail::code_location::current()) {
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
submit(q, ext::oneapi::experimental::empty_properties_t{},
nd_range<3>(r, size), k);
#else
submit(
q, [&](handler &h) { launch_grouped<KernelType>(h, r, size, k); },
codeLoc);
#endif
}

template <typename... Args>
Expand Down
140 changes: 135 additions & 5 deletions sycl/include/sycl/queue.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,14 @@ template <typename CommandGroupFunc, typename PropertiesT>
event submit_with_event_impl(const queue &Q, PropertiesT Props,
CommandGroupFunc &&CGF,
const sycl::detail::code_location &CodeLoc);

template <typename KernelName, typename PropertiesT, typename KernelType,
int Dims>
void submit_kernel_direct_impl(const queue &Q, PropertiesT Props,
nd_range<Dims> Range,
const KernelType &KernelFunc,
const sycl::detail::code_location &CodeLoc);

} // namespace detail
} // namespace ext::oneapi::experimental

Expand Down Expand Up @@ -3203,11 +3211,20 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
parallel_for(nd_range<Dims> Range, RestT &&...Rest) {
constexpr detail::code_location CodeLoc = getCodeLocation<KernelName>();
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);
return submit(
[&](handler &CGH) {
CGH.template parallel_for<KernelName>(Range, Rest...);
},
TlsCodeLocCapture.query());
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
if constexpr (sizeof...(RestT) == 1) {
return submit_kernel_direct_with_event<KernelName>(
ext::oneapi::experimental::empty_properties_t{}, Range, Rest...);
} else {
#endif
return submit(
[&](handler &CGH) {
CGH.template parallel_for<KernelName>(Range, Rest...);
},
TlsCodeLocCapture.query());
#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT
}
#endif
}

/// parallel_for version with a kernel represented as a lambda + nd_range that
Expand Down Expand Up @@ -3586,6 +3603,12 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
const queue &Q, PropertiesT Props, CommandGroupFunc &&CGF,
const sycl::detail::code_location &CodeLoc);

template <typename KernelName, typename PropertiesT, typename KernelType,
int Dims>
friend void ext::oneapi::experimental::detail::submit_kernel_direct_impl(
const queue &Q, PropertiesT Props, nd_range<Dims> Range,
const KernelType &KernelFunc, const sycl::detail::code_location &CodeLoc);

template <typename PropertiesT>
void ProcessSubmitProperties(PropertiesT Props,
detail::v1::SubmissionInfo &SI) const {
Expand Down Expand Up @@ -3670,6 +3693,36 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
const detail::code_location &CodeLoc,
bool IsTopCodeLoc) const;

event submit_kernel_direct_with_event_impl(
nd_range<1> Range, std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc) const;

event submit_kernel_direct_with_event_impl(
nd_range<2> Range, std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc) const;

event submit_kernel_direct_with_event_impl(
nd_range<3> Range, std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc) const;

void submit_kernel_direct_without_event_impl(
nd_range<1> Range, std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc) const;

void submit_kernel_direct_without_event_impl(
nd_range<2> Range, std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc) const;

void submit_kernel_direct_without_event_impl(
nd_range<3> Range, std::shared_ptr<detail::HostKernelBase> &HostKernel,
detail::DeviceKernelInfo *DeviceKernelInfo,
const detail::code_location &CodeLoc, bool IsTopCodeLoc) const;

/// A template-free version of submit_without_event as const member function.
void submit_without_event_impl(const detail::type_erased_cgfo_ty &CGH,
const detail::v1::SubmissionInfo &SubmitInfo,
Expand Down Expand Up @@ -3699,6 +3752,83 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase<queue> {
TlsCodeLocCapture.isToplevel());
}

template <typename KernelName = detail::auto_name, typename PropertiesT,
typename KernelType, int Dims>
event
submit_kernel_direct_with_event(PropertiesT Props, nd_range<Dims> Range,
const KernelType &KernelFunc,
const detail::code_location &CodeLoc =
detail::code_location::current()) const {
// TODO Properties not supported yet
(void)Props;
static_assert(
std::is_same_v<PropertiesT,
ext::oneapi::experimental::empty_properties_t>,
"Setting properties not supported yet for no-CGH kernel submit.");
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);

using NameT =
typename detail::get_kernel_name_t<KernelName, KernelType>::name;
using LambdaArgType =
sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
static_assert(
std::is_convertible_v<sycl::nd_item<Dims>, LambdaArgType>,
"Kernel argument of a sycl::parallel_for with sycl::nd_range "
"must be either sycl::nd_item or be convertible from sycl::nd_item");
using TransformedArgType = sycl::nd_item<Dims>;

std::shared_ptr<detail::HostKernelBase> HostKernel = std::make_shared<
detail::HostKernel<KernelType, TransformedArgType, Dims>>(KernelFunc);

detail::DeviceKernelInfo *DeviceKernelInfoPtr =
&detail::getDeviceKernelInfo<NameT>();

detail::KernelWrapper<detail::WrapAs::parallel_for, NameT, KernelType,
TransformedArgType, PropertiesT>::wrap(KernelFunc);

return submit_kernel_direct_with_event_impl(
Range, HostKernel, DeviceKernelInfoPtr, TlsCodeLocCapture.query(),
TlsCodeLocCapture.isToplevel());
}

template <typename KernelName = detail::auto_name, typename PropertiesT,
typename KernelType, int Dims>
void submit_kernel_direct_without_event(
PropertiesT Props, nd_range<Dims> Range, const KernelType &KernelFunc,
const detail::code_location &CodeLoc =
detail::code_location::current()) const {
// TODO Properties not supported yet
(void)Props;
static_assert(
std::is_same_v<PropertiesT,
ext::oneapi::experimental::empty_properties_t>,
"Setting properties not supported yet for no-CGH kernel submit.");
detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc);

using NameT =
typename detail::get_kernel_name_t<KernelName, KernelType>::name;
using LambdaArgType =
sycl::detail::lambda_arg_type<KernelType, nd_item<Dims>>;
static_assert(
std::is_convertible_v<sycl::nd_item<Dims>, LambdaArgType>,
"Kernel argument of a sycl::parallel_for with sycl::nd_range "
"must be either sycl::nd_item or be convertible from sycl::nd_item");
using TransformedArgType = sycl::nd_item<Dims>;

std::shared_ptr<detail::HostKernelBase> HostKernel = std::make_shared<
detail::HostKernel<KernelType, TransformedArgType, Dims>>(KernelFunc);

detail::DeviceKernelInfo *DeviceKernelInfoPtr =
&detail::getDeviceKernelInfo<NameT>();

detail::KernelWrapper<detail::WrapAs::parallel_for, NameT, KernelType,
TransformedArgType, PropertiesT>::wrap(KernelFunc);

submit_kernel_direct_without_event_impl(
Range, HostKernel, DeviceKernelInfoPtr, TlsCodeLocCapture.query(),
TlsCodeLocCapture.isToplevel());
}

/// Submits a command group function object to the queue, in order to be
/// scheduled for execution on the device.
///
Expand Down
Loading