diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index fc6519863daa5..a2a1d43faa47c 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -152,9 +152,19 @@ template void single_task(queue Q, const KernelType &KernelObj, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { - submit( - std::move(Q), - [&](handler &CGH) { single_task(CGH, KernelObj); }, CodeLoc); + if constexpr (!(ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod< + const KernelType &>::value) && + !(detail::KernelLambdaHasKernelHandlerArgT::value)) { + detail::submit_kernel_direct_single_task( + std::move(Q), empty_properties_t{}, KernelObj, CodeLoc); + } else { + submit( + std::move(Q), + [&](handler &CGH) { single_task(CGH, KernelObj); }, + CodeLoc); + } } template @@ -259,18 +269,17 @@ template void nd_launch(queue Q, nd_range Range, const KernelType &KernelObj, ReductionsT &&...Reductions) { -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT // TODO The handler-less path does not support reductions and kernel function // properties yet. if constexpr (sizeof...(ReductionsT) == 0 && !(ext::oneapi::experimental::detail:: HasKernelPropertiesGetMethod< - const KernelType &>::value)) { - detail::submit_kernel_direct(std::move(Q), empty_properties_t{}, + const KernelType &>::value) && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, sycl::nd_item>::value)) { + detail::submit_kernel_direct_parallel_for(std::move(Q), empty_properties_t{}, Range, KernelObj); - } else -#endif - { + } else { submit(std::move(Q), [&](handler &CGH) { nd_launch(CGH, Range, KernelObj, std::forward(Reductions)...); diff --git a/sycl/include/sycl/khr/free_function_commands.hpp b/sycl/include/sycl/khr/free_function_commands.hpp index 9ecd30c881c89..f41d2dd5f91f0 100644 --- a/sycl/include/sycl/khr/free_function_commands.hpp +++ b/sycl/include/sycl/khr/free_function_commands.hpp @@ -157,17 +157,16 @@ template r, range<1> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT // TODO The handler-less path does not support kernel function properties yet. if constexpr (!(ext::oneapi::experimental::detail:: HasKernelPropertiesGetMethod< - const KernelType &>::value)) { - detail::submit_kernel_direct( + const KernelType &>::value) && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, sycl::nd_item<1>>::value)) { + detail::submit_kernel_direct_parallel_for( q, ext::oneapi::experimental::empty_properties_t{}, nd_range<1>(r, size), std::forward(k)); - } else -#endif - { + } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, codeLoc); @@ -178,17 +177,16 @@ template r, range<2> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT // TODO The handler-less path does not support kernel function properties yet. if constexpr (!(ext::oneapi::experimental::detail:: HasKernelPropertiesGetMethod< - const KernelType &>::value)) { - detail::submit_kernel_direct( + const KernelType &>::value) && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, sycl::nd_item<2>>::value)) { + detail::submit_kernel_direct_parallel_for( q, ext::oneapi::experimental::empty_properties_t{}, nd_range<2>(r, size), std::forward(k)); - } else -#endif - { + } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, codeLoc); @@ -199,17 +197,16 @@ template r, range<3> size, KernelType &&k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT // TODO The handler-less path does not support kernel function properties yet. if constexpr (!(ext::oneapi::experimental::detail:: HasKernelPropertiesGetMethod< - const KernelType &>::value)) { - detail::submit_kernel_direct( + const KernelType &>::value) && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, sycl::nd_item<3>>::value)) { + detail::submit_kernel_direct_parallel_for( q, ext::oneapi::experimental::empty_properties_t{}, nd_range<3>(r, size), std::forward(k)); - } else -#endif - { + } else { submit( q, [&](handler &h) { launch_grouped(h, r, size, k); }, codeLoc); @@ -323,7 +320,17 @@ template void launch_task(const sycl::queue &q, const KernelType &k, const sycl::detail::code_location &codeLoc = sycl::detail::code_location::current()) { - submit(q, [&](handler &h) { launch_task(h, k); }, codeLoc); + if constexpr (!(ext::oneapi::experimental::detail:: + HasKernelPropertiesGetMethod< + const KernelType &>::value) && + !(detail::KernelLambdaHasKernelHandlerArgT::value)) { + detail::submit_kernel_direct_single_task( + q, ext::oneapi::experimental::empty_properties_t{}, k, codeLoc); + } else { + submit( + q, [&](handler &h) { launch_task(h, k); }, codeLoc); + } } template diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index f6dce0d01accc..3e83f01b236a5 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -158,7 +158,7 @@ class __SYCL_EXPORT SubmissionInfo { template -auto submit_kernel_direct( +auto submit_kernel_direct_parallel_for( const queue &Queue, PropertiesT Props, const nd_range &Range, KernelTypeUniversalRef &&KernelFunc, const detail::code_location &CodeLoc = detail::code_location::current()) { @@ -199,6 +199,22 @@ auto submit_kernel_direct( detail::DeviceKernelInfo *DeviceKernelInfoPtr = &detail::getDeviceKernelInfo(); + constexpr auto Info = detail::CompileTimeKernelInfo; + + assert(Info.Name != std::string_view{} && "Kernel must have a name!"); + + static_assert( + Info.Name == std::string_view{} || sizeof(KernelType) == Info.KernelSize, + "Unexpected kernel lambda size. This can be caused by an " + "external host compiler producing a lambda with an " + "unexpected layout. This is a limitation of the compiler." + "In many cases the difference is related to capturing constexpr " + "variables. In such cases removing constexpr specifier aligns the " + "captures between the host compiler and the device compiler." + "\n" + "In case of MSVC, passing " + "-fsycl-host-compiler-options='/std:c++latest' " + "might also help."); if constexpr (EventNeeded) { return submit_kernel_direct_with_event_impl( @@ -211,6 +227,69 @@ auto submit_kernel_direct( } } +template +auto submit_kernel_direct_single_task( + const queue &Queue, PropertiesT Props, KernelTypeUniversalRef &&KernelFunc, + const detail::code_location &CodeLoc = detail::code_location::current()) { + // TODO Properties not supported yet + (void)Props; + static_assert( + std::is_same_v, + "Setting properties not supported yet for no-CGH kernel submit."); + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + + using KernelType = + std::remove_const_t>; + + using NameT = + typename detail::get_kernel_name_t::name; + + detail::KernelWrapper::wrap(KernelFunc); + + HostKernelRef + HostKernel(std::forward(KernelFunc)); + + // Instantiating the kernel on the host improves debugging. + // Passing this pointer to another translation unit prevents optimization. +#ifndef NDEBUG + // TODO: call library to prevent dropping call due to optimization + (void) + detail::GetInstantiateKernelOnHostPtr(); +#endif + + detail::DeviceKernelInfo *DeviceKernelInfoPtr = + &detail::getDeviceKernelInfo(); + constexpr auto Info = detail::CompileTimeKernelInfo; + + assert(Info.Name != std::string_view{} && "Kernel must have a name!"); + + static_assert( + Info.Name == std::string_view{} || sizeof(KernelType) == Info.KernelSize, + "Unexpected kernel lambda size. This can be caused by an " + "external host compiler producing a lambda with an " + "unexpected layout. This is a limitation of the compiler." + "In many cases the difference is related to capturing constexpr " + "variables. In such cases removing constexpr specifier aligns the " + "captures between the host compiler and the device compiler." + "\n" + "In case of MSVC, passing " + "-fsycl-host-compiler-options='/std:c++latest' " + "might also help."); + + if constexpr (EventNeeded) { + return submit_kernel_direct_with_event_impl( + Queue, nd_range<1>{1, 1}, HostKernel, DeviceKernelInfoPtr, + TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); + } else { + submit_kernel_direct_without_event_impl( + Queue, nd_range<1>{1, 1}, HostKernel, DeviceKernelInfoPtr, + TlsCodeLocCapture.query(), TlsCodeLocCapture.isToplevel()); + } +} + } // namespace detail namespace ext ::oneapi ::experimental { @@ -2721,13 +2800,25 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { "sycl::queue.single_task() requires a kernel instead of command group. " "Use queue.submit() instead"); - detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); - return submit( - [&](handler &CGH) { - CGH.template single_task( - Properties, KernelFunc); - }, - TlsCodeLocCapture.query()); + if constexpr ( + std::is_same_v && + !(ext::oneapi::experimental::detail::HasKernelPropertiesGetMethod< + const KernelType &>::value) && + !(detail::KernelLambdaHasKernelHandlerArgT::value)) { + (void)Properties; + return detail::submit_kernel_direct_single_task( + *this, ext::oneapi::experimental::empty_properties_t{}, + KernelFunc, CodeLoc); + } else { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + return submit( + [&](handler &CGH) { + CGH.template single_task( + Properties, KernelFunc); + }, + TlsCodeLocCapture.query()); + } } /// single_task version with a kernel represented as a lambda. @@ -3275,7 +3366,6 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { parallel_for(nd_range Range, RestT &&...Rest) { constexpr detail::code_location CodeLoc = getCodeLocation(); detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); -#ifdef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT using KernelType = std::tuple_element_t<0, std::tuple>; // TODO The handler-less path does not support reductions and kernel @@ -3283,13 +3373,13 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { if constexpr (sizeof...(RestT) == 1 && !(ext::oneapi::experimental::detail:: HasKernelPropertiesGetMethod< - const KernelType &>::value)) { - return detail::submit_kernel_direct( + const KernelType &>::value) && + !(detail::KernelLambdaHasKernelHandlerArgT< + KernelType, sycl::nd_item>::value)) { + return detail::submit_kernel_direct_parallel_for( *this, ext::oneapi::experimental::empty_properties_t{}, Range, Rest...); - } else -#endif - { + } else { return submit( [&](handler &CGH) { CGH.template parallel_for(Range, Rest...); diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index b8c2a2866a625..0ac1a4dd0d5e1 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -662,6 +662,14 @@ queue_impl::submit_direct(bool CallerNeedsEvent, : true) && !hasCommandGraph(); + if (isInOrder()) { + if (SchedulerBypass) { + MNoLastEventMode.store(true, std::memory_order_relaxed); + } else { + MNoLastEventMode.store(false, std::memory_order_relaxed); + } + } + EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass); // Sync with the last event for in order queue. For scheduler-bypass flow, diff --git a/sycl/test-e2e/Basic/test_num_kernel_copies.cpp b/sycl/test-e2e/Basic/test_num_kernel_copies.cpp index 7c1781e873a39..8f90defcb9566 100644 --- a/sycl/test-e2e/Basic/test_num_kernel_copies.cpp +++ b/sycl/test-e2e/Basic/test_num_kernel_copies.cpp @@ -29,13 +29,13 @@ int main(int argc, char **argv) { kernel<1> krn1; q.parallel_for(sycl::nd_range<1>{1, 1}, krn1); - assert(copy_count == 1); + assert(copy_count == 0); assert(move_count == 0); copy_count = 0; kernel<2> krn2; q.single_task(krn2); - assert(copy_count == 1); + assert(copy_count == 0); assert(move_count == 0); copy_count = 0; diff --git a/sycl/test/basic_tests/kernel_size_mismatch.cpp b/sycl/test/basic_tests/kernel_size_mismatch.cpp index 03ba5a7983bf4..abe16ca02929e 100644 --- a/sycl/test/basic_tests/kernel_size_mismatch.cpp +++ b/sycl/test/basic_tests/kernel_size_mismatch.cpp @@ -13,7 +13,7 @@ int main() { (void)A; // expected-no-diagnostics #else - // expected-error-re@sycl/handler.hpp:* {{static assertion failed due to requirement '{{.*}}': Unexpected kernel lambda size. This can be caused by an external host compiler producing a lambda with an unexpected layout. This is a limitation of the compiler.}} + // expected-error-re@sycl/queue.hpp:* {{static assertion failed due to requirement '{{.*}}': Unexpected kernel lambda size. This can be caused by an external host compiler producing a lambda with an unexpected layout. This is a limitation of the compiler.}} #endif }).wait(); } diff --git a/sycl/test/basic_tests/single_task_error_message.cpp b/sycl/test/basic_tests/single_task_error_message.cpp index 7389296645d63..34f8b4bef08d0 100644 --- a/sycl/test/basic_tests/single_task_error_message.cpp +++ b/sycl/test/basic_tests/single_task_error_message.cpp @@ -2,20 +2,6 @@ #include #include int main() { - { - int varA = 42; - int varB = 42; - int sum = 0; - sycl::queue myQueue{}; - { - myQueue - .single_task([&](sycl::handler &cgh) { - // expected-error-re@sycl/queue.hpp:* {{static assertion failed due to requirement '{{.*}}': sycl::queue.single_task() requires a kernel instead of command group.{{.*}} Use queue.submit() instead}} - // expected-error-re@sycl/detail/cg_types.hpp:* {{no matching function for call to object of type '(lambda at {{.*}}single_task_error_message.cpp:{{.*}})'}} - }) - .wait(); - } - } { int varA = 42; int varB = 42; diff --git a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp index ea523283064ea..ca0753013c03b 100644 --- a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp +++ b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp @@ -227,7 +227,6 @@ TEST_F(FreeFunctionCommandsEventsTests, LaunchGroupedShortcutNoEvent) { ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); } -#if __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT TEST_F(FreeFunctionCommandsEventsTests, LaunchGroupedShortcutMoveKernelNoEvent) { mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", @@ -276,7 +275,6 @@ TEST_F(FreeFunctionCommandsEventsTests, ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 1); ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{2}); } -#endif TEST_F(FreeFunctionCommandsEventsTests, SubmitLaunchGroupedKernelNoEvent) { mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", diff --git a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp index e0a5c9be50c15..c275f7d03cc9d 100644 --- a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp +++ b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionEventsHelpers.hpp @@ -26,53 +26,53 @@ inline ur_result_t after_urKernelGetInfo(void *pParams) { static thread_local size_t counter_urEnqueueKernelLaunch = 0; inline ur_result_t redefined_urEnqueueKernelLaunch(void *pParams) { ++counter_urEnqueueKernelLaunch; -// TODO The no-handler scheduler submission includes a fix for the event return, -// where the event is returned by the scheduler on every submission. This fix -// is not yet applied to the handler-based path. -#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - auto params = *static_cast(pParams); - EXPECT_EQ(*params.pphEvent, nullptr); -#endif + // TODO The no-handler scheduler submission includes a fix for the event + // return, where the event is returned by the scheduler on every submission. + // This fix is not yet applied to the handler-based path. #ifndef + // __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + // auto params = *static_cast(pParams); + // EXPECT_EQ(*params.pphEvent, nullptr); + // #endif return UR_RESULT_SUCCESS; } static thread_local size_t counter_urUSMEnqueueMemcpy = 0; inline ur_result_t redefined_urUSMEnqueueMemcpy(void *pParams) { ++counter_urUSMEnqueueMemcpy; -#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - auto params = *static_cast(pParams); - EXPECT_EQ(*params.pphEvent, nullptr); -#endif + // #ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + // auto params = *static_cast(pParams); + // EXPECT_EQ(*params.pphEvent, nullptr); + // #endif return UR_RESULT_SUCCESS; } static thread_local size_t counter_urUSMEnqueueFill = 0; inline ur_result_t redefined_urUSMEnqueueFill(void *pParams) { ++counter_urUSMEnqueueFill; -#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - auto params = *static_cast(pParams); - EXPECT_EQ(*params.pphEvent, nullptr); -#endif + // #ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + // auto params = *static_cast(pParams); + // EXPECT_EQ(*params.pphEvent, nullptr); + // #endif return UR_RESULT_SUCCESS; } static thread_local size_t counter_urUSMEnqueuePrefetch = 0; inline ur_result_t redefined_urUSMEnqueuePrefetch(void *pParams) { ++counter_urUSMEnqueuePrefetch; -#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - auto params = *static_cast(pParams); - EXPECT_EQ(*params.pphEvent, nullptr); -#endif + // #ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + // auto params = *static_cast(pParams); + // EXPECT_EQ(*params.pphEvent, nullptr); + // #endif return UR_RESULT_SUCCESS; } static thread_local size_t counter_urUSMEnqueueMemAdvise = 0; inline ur_result_t redefined_urUSMEnqueueMemAdvise(void *pParams) { ++counter_urUSMEnqueueMemAdvise; -#ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT - auto params = *static_cast(pParams); - EXPECT_EQ(*params.pphEvent, nullptr); -#endif + // #ifndef __DPCPP_ENABLE_UNFINISHED_NO_CGH_SUBMIT + // auto params = *static_cast(pParams); + // EXPECT_EQ(*params.pphEvent, nullptr); + // #endif return UR_RESULT_SUCCESS; }