diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index e393bd626d4d6..8c8488a99e354 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -152,9 +152,21 @@ template void single_task(queue Q, const KernelType &KernelObj, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { - submit( - std::move(Q), - [&](handler &CGH) { single_task(CGH, KernelObj); }, CodeLoc); + // TODO The handler-less path does not support kernel function properties + // and kernel functions with the kernel_handler type argument yet. + if constexpr (!(ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod< + const KernelType &>::value) && + !(detail::KernelLambdaHasKernelHandlerArgT::value)) { + detail::submit_kernel_direct_single_task( + std::move(Q), empty_properties_t{}, KernelObj, CodeLoc); + } else { + submit( + std::move(Q), + [&](handler &CGH) { single_task(CGH, KernelObj); }, + CodeLoc); + } } template @@ -268,8 +280,8 @@ void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { - detail::submit_kernel_direct(std::move(Q), empty_properties_t{}, - Range, KernelObj); + detail::submit_kernel_direct_parallel_for( + std::move(Q), empty_properties_t{}, Range, KernelObj); } else { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Range, KernelObj, diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index b04fac17a6f9c..68dd159bf8211 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -164,7 +164,7 @@ void launch_grouped(const queue &q, range<1> r, range<1> size, KernelType &&k, const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<1>>::value)) { - detail::submit_kernel_direct( + detail::submit_kernel_direct_parallel_for( q, ext::oneapi::experimental::empty_properties_t{}, nd_range<1>(r, size), std::forward(k)); } else { @@ -185,7 +185,7 @@ void launch_grouped(const queue &q, range<2> r, range<2> size, KernelType &&k, const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<2>>::value)) { - detail::submit_kernel_direct( + detail::submit_kernel_direct_parallel_for( q, ext::oneapi::experimental::empty_properties_t{}, nd_range<2>(r, size), std::forward(k)); } else { @@ -206,7 +206,7 @@ void launch_grouped(const queue &q, range<3> r, range<3> size, KernelType &&k, const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item<3>>::value)) { - detail::submit_kernel_direct( + detail::submit_kernel_direct_parallel_for( q, ext::oneapi::experimental::empty_properties_t{}, nd_range<3>(r, size), std::forward(k)); } else { @@ -319,11 +319,24 @@ void launch_task(handler &h, const KernelType &k) { h.single_task(k); } -template -void launch_task(const sycl::queue &q, const KernelType &k, +template >> +void launch_task(const sycl::queue &q, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - submit(q, [&](handler &h) { launch_task(h, k); }, codeLoc); + // TODO The handler-less path does not support kernel function properties + // and kernel functions with the kernel_handler type argument yet. + if constexpr (!(ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod< + const KernelType &>::value) && + !(detail::KernelLambdaHasKernelHandlerArgT::value)) { + detail::submit_kernel_direct_single_task( + q, ext::oneapi::experimental::empty_properties_t{}, + std::forward(k), codeLoc); + } else { + submit(q, [&](handler &h) { launch_task(h, k); }, codeLoc); + } } template diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 755b07f39bbf3..4a7f1fac789a3 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -157,14 +157,14 @@ class __SYCL_EXPORT SubmissionInfo { } // namespace v1 -template auto submit_kernel_direct( - const queue &Queue, PropertiesT Props, const nd_range &Range, - KernelTypeUniversalRef &&KernelFunc, + const queue &Queue, [[maybe_unused]] PropertiesT Props, + const nd_range &Range, KernelTypeUniversalRef &&KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) { // TODO Properties not supported yet - (void)Props; static_assert( std::is_same_v, @@ -176,34 +176,39 @@ auto submit_kernel_direct( using NameT = typename detail::get_kernel_name_t::name; - using LambdaArgType = - sycl::detail::lambda_arg_type>; - static_assert( - std::is_convertible_v, LambdaArgType>, - "Kernel argument of a sycl::parallel_for with sycl::nd_range " - "must be either sycl::nd_item or be convertible from sycl::nd_item"); - using TransformedArgType = sycl::nd_item; - -#ifndef __SYCL_DEVICE_ONLY__ - detail::checkValueRange(Range); -#endif - detail::KernelWrapper::wrap(KernelFunc); + detail::KernelWrapper::wrap(KernelFunc); - HostKernelRef + HostKernelRef HostKernel(std::forward(KernelFunc)); // Instantiating the kernel on the host improves debugging. // Passing this pointer to another translation unit prevents optimization. #ifndef NDEBUG - // TODO: call library to prevent dropping call due to optimization + // TODO: call library to prevent dropping call due to optimization. (void) detail::GetInstantiateKernelOnHostPtr(); #endif detail::DeviceKernelInfo *DeviceKernelInfoPtr = &detail::getDeviceKernelInfo(); + constexpr auto Info = detail::CompileTimeKernelInfo; + + assert(Info.Name != std::string_view{} && "Kernel must have a name!"); + + static_assert( + Info.Name == std::string_view{} || sizeof(KernelType) == Info.KernelSize, + "Unexpected kernel lambda size. This can be caused by an " + "external host compiler producing a lambda with an " + "unexpected layout. This is a limitation of the compiler." + "In many cases the difference is related to capturing constexpr " + "variables. In such cases removing constexpr specifier aligns the " + "captures between the host compiler and the device compiler." + "\n" + "In case of MSVC, passing " + "-fsycl-host-compiler-options='/std:c++latest' " + "might also help."); if constexpr (EventNeeded) { return submit_kernel_direct_with_event_impl( @@ -216,6 +221,48 @@ auto submit_kernel_direct( } } +template +auto submit_kernel_direct_parallel_for( + const queue &Queue, PropertiesT Props, const nd_range &Range, + KernelTypeUniversalRef &&KernelFunc, + const detail::code_location &CodeLoc = detail::code_location::current()) { + + using KernelType = + std::remove_const_t>; + + using LambdaArgType = + sycl::detail::lambda_arg_type>; + static_assert( + std::is_convertible_v, LambdaArgType>, + "Kernel argument of a sycl::parallel_for with sycl::nd_range " + "must be either sycl::nd_item or be convertible from sycl::nd_item"); + using TransformedArgType = sycl::nd_item; + +#ifndef __SYCL_DEVICE_ONLY__ + detail::checkValueRange(Range); +#endif + + return submit_kernel_direct( + Queue, Props, Range, std::forward(KernelFunc), + CodeLoc); +} + +template +auto submit_kernel_direct_single_task( + const queue &Queue, PropertiesT Props, KernelTypeUniversalRef &&KernelFunc, + const detail::code_location &CodeLoc = detail::code_location::current()) { + + return submit_kernel_direct( + Queue, Props, nd_range<1>{1, 1}, + std::forward(KernelFunc), CodeLoc); +} + } // namespace detail namespace ext ::oneapi ::experimental { @@ -2727,12 +2774,27 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { "Use queue.submit() instead"); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.template single_task( - Properties, KernelFunc); - }, - TlsCodeLocCapture.query()); + + // TODO The handler-less path does not support kernel + // function properties and kernel functions with the kernel_handler + // type argument yet. + if constexpr ( + std::is_same_v && + !(ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< + const KernelType &>::value) && + !(detail::KernelLambdaHasKernelHandlerArgT::value)) { + return detail::submit_kernel_direct_single_task( + *this, ext::oneapi::experimental::empty_properties_t{}, KernelFunc, + TlsCodeLocCapture.query()); + } else { + return submit( + [&](handler &CGH) { + CGH.template single_task( + Properties, KernelFunc); + }, + TlsCodeLocCapture.query()); + } } /// single_task version with a kernel represented as a lambda. @@ -3291,7 +3353,7 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { const KernelType &>::value) && !(detail::KernelLambdaHasKernelHandlerArgT< KernelType, sycl::nd_item>::value)) { - return detail::submit_kernel_direct( + return detail::submit_kernel_direct_parallel_for( *this, ext::oneapi::experimental::empty_properties_t{}, Range, Rest..., TlsCodeLocCapture.query()); } else { diff --git a/sycl/test-e2e/Basic/test_num_kernel_copies.cpp b/sycl/test-e2e/Basic/test_num_kernel_copies.cpp index 3c75d577227b3..82f8477a10962 100644 --- a/sycl/test-e2e/Basic/test_num_kernel_copies.cpp +++ b/sycl/test-e2e/Basic/test_num_kernel_copies.cpp @@ -36,7 +36,8 @@ int main(int argc, char **argv) { kernel<2> krn2; q.single_task(krn2); - assert(copy_count == 1); + // The kernel is copied on the scheduler-based path only. + assert(copy_count == 0); assert(move_count == 0); copy_count = 0; diff --git a/sycl/test/basic_tests/kernel_size_mismatch.cpp b/sycl/test/basic_tests/kernel_size_mismatch.cpp index 03ba5a7983bf4..abe16ca02929e 100644 --- a/sycl/test/basic_tests/kernel_size_mismatch.cpp +++ b/sycl/test/basic_tests/kernel_size_mismatch.cpp @@ -13,7 +13,7 @@ int main() { (void)A; // expected-no-diagnostics #else - // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement '{{.*}}': Unexpected kernel lambda size. This can be caused by an external host compiler producing a lambda with an unexpected layout. This is a limitation of the compiler.}} + // expected-error-re@sycl/queue.hpp:* {{static assertion failed due to requirement '{{.*}}': Unexpected kernel lambda size. This can be caused by an external host compiler producing a lambda with an unexpected layout. This is a limitation of the compiler.}} #endif }).wait(); } diff --git a/sycl/test/basic_tests/single_task_error_message.cpp b/sycl/test/basic_tests/single_task_error_message.cpp index 7389296645d63..b086c81e4f2af 100644 --- a/sycl/test/basic_tests/single_task_error_message.cpp +++ b/sycl/test/basic_tests/single_task_error_message.cpp @@ -12,6 +12,9 @@ int main() { .single_task([&](sycl::handler &cgh) { // expected-error-re@sycl/queue.hpp:* {{static assertion failed due to requirement '{{.*}}': sycl::queue.single_task() requires a kernel instead of command group.{{.*}} Use queue.submit() instead}} // expected-error-re@sycl/detail/cg_types.hpp:* {{no matching function for call to object of type '(lambda at {{.*}}single_task_error_message.cpp:{{.*}})'}} + // TODO Investigate why this function template is not instantiated + // (if this is expected). + // expected-error@sycl/detail/cg_types.hpp:* {{no matching function for call to 'runKernelWithoutArg'}} }) .wait(); } diff --git a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp index be5302a1c12a4..be124f4452f54 100644 --- a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp +++ b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp @@ -78,6 +78,7 @@ class FreeFunctionCommandsEventsTests : public ::testing::Test { protected: void SetUp() override { counter_urEnqueueKernelLaunch = 0; + counter_urEnqueueKernelLaunchWithEvent = 0; counter_urUSMEnqueueMemcpy = 0; counter_urUSMEnqueueFill = 0; counter_urUSMEnqueuePrefetch = 0; @@ -281,6 +282,57 @@ TEST_F(FreeFunctionCommandsEventsTests, ASSERT_EQ(counter_urEnqueueKernelLaunchWithEvent, size_t{1}); } +TEST_F(FreeFunctionCommandsEventsTests, LaunchTaskShortcutMoveKernel) { + mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", + &redefined_urEnqueueKernelLaunch); + + TestMoveFunctor::MoveCtorCalls = 0; + TestMoveFunctor MoveOnly; + std::mutex CvMutex; + std::condition_variable Cv; + bool ready = false; + + // This kernel submission uses scheduler-bypass path, so the HostKernel + // shouldn't be constructed. + + sycl::khr::launch_task(Queue, std::move(MoveOnly)); + + ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 0); + ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + + // Another kernel submission is queued behind a host task, + // to force the scheduler-based submission. In this case, the HostKernel + // should be constructed. + + // Replace the callback with an event based one, since the scheduler + // needs to create an event internally + mock::getCallbacks().set_replace_callback( + "urEnqueueKernelLaunch", &redefined_urEnqueueKernelLaunchWithEvent); + + Queue.submit([&](sycl::handler &CGH) { + CGH.host_task([&] { + std::unique_lock lk(CvMutex); + Cv.wait(lk, [&ready] { return ready; }); + }); + }); + + sycl::khr::launch_task(Queue, std::move(MoveOnly)); + + { + std::unique_lock lk(CvMutex); + ready = true; + } + Cv.notify_one(); + + Queue.wait(); + + // Move ctor for TestMoveFunctor is called during move construction of + // HostKernel. Copy ctor is called by InstantiateKernelOnHost, can't delete + // it. + ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 1); + ASSERT_EQ(counter_urEnqueueKernelLaunchWithEvent, size_t{1}); +} + TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchGroupedKernelNoEvent) { mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", &redefined_urEnqueueKernelLaunch);