diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index f3a35961e3dd2..f9440c089f2a0 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -511,7 +511,7 @@ event handler::finalize() { // TODO checking the size of the events vector and avoiding the call is more // efficient here at this point - const bool KernelFastPath = + const bool KernelSchedulerBypass = (Queue && !Graph && !impl->MSubgraphNode && !Queue->hasCommandGraph() && !impl->CGData.MRequirements.size() && !MStreamStorage.size() && (impl->CGData.MEvents.size() == 0 || @@ -521,7 +521,7 @@ event handler::finalize() { // Extract arguments from the kernel lambda, if required. // Skipping this is currently limited to simple kernels on the fast path. if (type == detail::CGType::Kernel && impl->MKernelData.getKernelFuncPtr() && - (!KernelFastPath || impl->MKernelData.hasSpecialCaptures())) { + (!KernelSchedulerBypass || impl->MKernelData.hasSpecialCaptures())) { impl->MKernelData.extractArgsAndReqsFromLambda(); } @@ -633,7 +633,7 @@ event handler::finalize() { } } - if (KernelFastPath) { + if (KernelSchedulerBypass) { // if user does not add a new dependency to the dependency graph, i.e. // the graph is not changed, then this faster path is used to submit // kernel bypassing scheduler and avoiding CommandGroup, Command objects @@ -879,9 +879,18 @@ event handler::finalize() { #endif } - bool DiscardEvent = !impl->MEventNeeded && Queue && - Queue->supportsDiscardingPiEvents() && - CommandGroup->getRequirements().size() == 0; + // For kernel submission, regardless of whether an event has been requested, + // the scheduler needs to generate an event so the commands are properly + // ordered (for in-order queue) and synchronized with a barrier (for + // out-of-order queue). The event can only be skipped for the scheduler bypass + // path. + // + // For commands other than kernel submission, if an event has not been + // requested, the queue supports events discarding, and the scheduler + // could have been bypassed (not supported yet), the event can be skipped. + bool DiscardEvent = + (type != detail::CGType::Kernel && KernelSchedulerBypass && + !impl->MEventNeeded && Queue->supportsDiscardingPiEvents()); detail::EventImplPtr Event = detail::Scheduler::getInstance().addCG( std::move(CommandGroup), *Queue, !DiscardEvent); diff --git a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp index c79c6ecdbba51..72db8cc55ab4c 100644 --- a/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp +++ b/sycl/unittests/scheduler/InOrderQueueHostTaskDeps.cpp @@ -21,6 +21,8 @@ using namespace sycl; +namespace oneapiext = ext::oneapi::experimental; + size_t GEventsWaitCounter = 0; inline ur_result_t redefinedEventsWaitWithBarrier(void *pParams) { @@ -49,20 +51,20 @@ TEST_F(SchedulerTest, InOrderQueueHostTaskDeps) { } enum class CommandType { KERNEL = 1, MEMSET = 2, HOST_TASK = 3 }; -std::vector> ExecutedCommands; +std::vector> ExecutedCommands; inline ur_result_t customEnqueueKernelLaunchWithArgsExp(void *pParams) { auto params = *static_cast(pParams); - ExecutedCommands.push_back( - {CommandType::KERNEL, *params.pnumEventsInWaitList}); + ExecutedCommands.push_back({CommandType::KERNEL, *params.pnumEventsInWaitList, + **params.ppGlobalWorkSize}); return UR_RESULT_SUCCESS; } inline ur_result_t customEnqueueUSMFill(void *pParams) { auto params = *static_cast(pParams); ExecutedCommands.push_back( - {CommandType::MEMSET, *params.pnumEventsInWaitList}); + {CommandType::MEMSET, *params.pnumEventsInWaitList, 0}); return UR_RESULT_SUCCESS; } @@ -92,6 +94,7 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) { CGH.host_task([&] { std::unique_lock lk(CvMutex); Cv.wait(lk, [&ready] { return ready; }); + ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0}); }); }); @@ -113,11 +116,15 @@ TEST_F(SchedulerTest, InOrderQueueCrossDeps) { InOrderQueue.wait(); - ASSERT_EQ(ExecutedCommands.size(), 2u); - EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::MEMSET); - EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u); - EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL); - EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u); + ASSERT_EQ(ExecutedCommands.size(), 3u); + EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/, + CommandType::HOST_TASK); + EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/, + CommandType::MEMSET); + EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u); + EXPECT_EQ(std::get<0>(ExecutedCommands[2]) /*CommandType*/, + CommandType::KERNEL); + EXPECT_EQ(std::get<1>(ExecutedCommands[2]) /*EventsCount*/, 0u); } TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { @@ -142,6 +149,7 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { CGH.host_task([&] { std::unique_lock lk(CvMutex); Cv.wait(lk, [&ready] { return ready; }); + ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0}); }); }); @@ -159,11 +167,15 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncs) { InOrderQueue.wait(); - ASSERT_EQ(ExecutedCommands.size(), 2u); - EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::MEMSET); - EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u); - EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL); - EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u); + ASSERT_EQ(ExecutedCommands.size(), 3u); + EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/, + CommandType::HOST_TASK); + EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/, + CommandType::MEMSET); + EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u); + EXPECT_EQ(std::get<0>(ExecutedCommands[2]) /*CommandType*/, + CommandType::KERNEL); + EXPECT_EQ(std::get<1>(ExecutedCommands[2]) /*EventsCount*/, 0u); } TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) { @@ -186,7 +198,7 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) { CGH.host_task([&] { std::unique_lock lk(CvMutex); Cv.wait(lk, [&ready] { return ready; }); - ExecutedCommands.push_back({CommandType::HOST_TASK, 0}); + ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0}); }); }); @@ -202,8 +214,61 @@ TEST_F(SchedulerTest, InOrderQueueCrossDepsShortcutFuncsParallelFor) { InOrderQueue.wait(); ASSERT_EQ(ExecutedCommands.size(), 2u); - EXPECT_EQ(ExecutedCommands[0].first /*CommandType*/, CommandType::HOST_TASK); - EXPECT_EQ(ExecutedCommands[0].second /*EventsCount*/, 0u); - EXPECT_EQ(ExecutedCommands[1].first /*CommandType*/, CommandType::KERNEL); - EXPECT_EQ(ExecutedCommands[1].second /*EventsCount*/, 0u); + EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/, + CommandType::HOST_TASK); + EXPECT_EQ(std::get<1>(ExecutedCommands[0]) /*EventsCount*/, 0u); + EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/, + CommandType::KERNEL); + EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u); +} + +TEST_F(SchedulerTest, InOrderQueueCrossDepsEnqueueFunctions) { + ExecutedCommands.clear(); + sycl::unittest::UrMock<> Mock; + mock::getCallbacks().set_before_callback( + "urEnqueueKernelLaunchWithArgsExp", + &customEnqueueKernelLaunchWithArgsExp); + + sycl::platform Plt = sycl::platform(); + + context Ctx{Plt}; + queue InOrderQueue{Ctx, default_selector_v, property::queue::in_order()}; + + std::mutex CvMutex; + std::condition_variable Cv; + bool ready = false; + + InOrderQueue.submit([&](sycl::handler &CGH) { + CGH.host_task([&] { + std::unique_lock lk(CvMutex); + Cv.wait(lk, [&ready] { return ready; }); + ExecutedCommands.push_back({CommandType::HOST_TASK, 0, 0}); + }); + }); + + oneapiext::nd_launch( + InOrderQueue, nd_range<1>{range<1>{32}, range<1>{32}}, [](nd_item<1>) {}); + + oneapiext::nd_launch( + InOrderQueue, nd_range<1>{range<1>{64}, range<1>{32}}, [](nd_item<1>) {}); + + { + std::unique_lock lk(CvMutex); + ready = true; + } + Cv.notify_one(); + + InOrderQueue.wait(); + + ASSERT_EQ(ExecutedCommands.size(), 3u); + EXPECT_EQ(std::get<0>(ExecutedCommands[0]) /*CommandType*/, + CommandType::HOST_TASK); + EXPECT_EQ(std::get<0>(ExecutedCommands[1]) /*CommandType*/, + CommandType::KERNEL); + EXPECT_EQ(std::get<1>(ExecutedCommands[1]) /*EventsCount*/, 0u); + EXPECT_EQ(std::get<2>(ExecutedCommands[1]) /*GlobalWorkSize*/, 32u); + EXPECT_EQ(std::get<0>(ExecutedCommands[2]) /*CommandType*/, + CommandType::KERNEL); + EXPECT_EQ(std::get<1>(ExecutedCommands[2]) /*EventsCount*/, 0u); + EXPECT_EQ(std::get<2>(ExecutedCommands[2]) /*GlobalWorkSize*/, 64u); }