diff --git a/sycl/source/detail/event_impl.hpp b/sycl/source/detail/event_impl.hpp index 86e9b26b98084..768de70826624 100644 --- a/sycl/source/detail/event_impl.hpp +++ b/sycl/source/detail/event_impl.hpp @@ -152,6 +152,9 @@ class event_impl { /// Clear the event state void setStateIncomplete(); + /// Set state as discarded. + void setStateDiscarded() { MState = HES_Discarded; } + /// Returns command that is associated with the event. /// /// Scheduler mutex must be locked in read mode when this is called. diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index ab8348d3aacac..a6f1559cd8b2f 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -308,8 +308,9 @@ void queue_impl::addEvent(const event &Event) { addSharedEvent(Event); } // As long as the queue supports urQueueFinish we only need to store events - // for unenqueued commands and host tasks. - else if (MEmulateOOO || EImpl->getHandle() == nullptr) { + // for undiscarded, unenqueued commands and host tasks. + else if (MEmulateOOO || + (EImpl->getHandle() == nullptr && !EImpl->isDiscarded())) { std::weak_ptr EventWeakPtr{EImpl}; std::lock_guard Lock{MMutex}; MEventsWeak.push_back(std::move(EventWeakPtr)); @@ -412,13 +413,24 @@ event queue_impl::submit_impl(const std::function &CGF, template event queue_impl::submitWithHandler(const std::shared_ptr &Self, const std::vector &DepEvents, + bool CallerNeedsEvent, HandlerFuncT HandlerFunc) { - return submit( + SubmissionInfo SI{}; + if (!CallerNeedsEvent && supportsDiscardingPiEvents()) { + submit_without_event( + [&](handler &CGH) { + CGH.depends_on(DepEvents); + HandlerFunc(CGH); + }, + Self, SI, /*CodeLoc*/ {}, /*IsTopCodeLoc*/ true); + return createDiscardedEvent(); + } + return submit_with_event( [&](handler &CGH) { CGH.depends_on(DepEvents); HandlerFunc(CGH); }, - Self, /*CodeLoc*/ {}, /*SubmissionInfo*/ {}, /*IsTopCodeLoc*/ true); + Self, SI, /*CodeLoc*/ {}, /*IsTopCodeLoc*/ true); } template @@ -446,7 +458,16 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr &Self, NestedCallsTracker tracker; MemOpFunc(MemOpArgs..., getUrEvents(ExpandedDepEvents), /*PiEvent*/ nullptr, /*EventImplPtr*/ nullptr); - return createDiscardedEvent(); + + event DiscardedEvent = createDiscardedEvent(); + if (isInOrder()) { + // Store the discarded event for proper in-order dependency tracking. + auto &EventToStoreIn = MGraph.expired() + ? MDefaultGraphDeps.LastEventPtr + : MExtGraphDeps.LastEventPtr; + EventToStoreIn = detail::getSyclObjImpl(DiscardedEvent); + } + return DiscardedEvent; } event ResEvent = prepareSYCLEventAssociatedWithQueue(Self); @@ -471,7 +492,7 @@ event queue_impl::submitMemOpHelper(const std::shared_ptr &Self, return discard_or_return(ResEvent); } } - return submitWithHandler(Self, DepEvents, HandlerFunc); + return submitWithHandler(Self, DepEvents, CallerNeedsEvent, HandlerFunc); } void *queue_impl::instrumentationProlog(const detail::code_location &CodeLoc, diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 2daef04280c05..ee6b795211e6b 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -868,7 +868,7 @@ class queue_impl { template event submitWithHandler(const std::shared_ptr &Self, const std::vector &DepEvents, - HandlerFuncT HandlerFunc); + bool CallerNeedsEvent, HandlerFuncT HandlerFunc); /// Performs submission of a memory operation directly if scheduler can be /// bypassed, or with a handler otherwise. diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 5c42709930436..b56e75ab952e6 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -956,7 +956,7 @@ bool Command::enqueue(EnqueueResultT &EnqueueResult, BlockingT Blocking, EnqueueResultT(EnqueueResultT::SyclEnqueueFailed, this, Res); else { MEvent->setEnqueued(); - if (MShouldCompleteEventIfPossible && + if (MShouldCompleteEventIfPossible && !MEvent->isDiscarded() && (MEvent->isHost() || MEvent->getHandle() == nullptr)) MEvent->setComplete(); @@ -3055,6 +3055,13 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { ur_event_handle_t *Event = DiscardUrEvent ? nullptr : &UREvent; detail::EventImplPtr EventImpl = DiscardUrEvent ? nullptr : MEvent; + auto SetEventHandleOrDiscard = [&]() { + if (Event) + MEvent->setHandle(*Event); + else + MEvent->setStateDiscarded(); + }; + switch (MCommandGroup->getType()) { case CGType::UpdateHost: { @@ -3188,8 +3195,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { Result != UR_RESULT_SUCCESS) return Result; - if (Event) - MEvent->setHandle(*Event); + SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } case CGType::FillUSM: { @@ -3200,8 +3206,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { Result != UR_RESULT_SUCCESS) return Result; - if (Event) - MEvent->setHandle(*Event); + SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } case CGType::PrefetchUSM: { @@ -3212,8 +3217,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { Result != UR_RESULT_SUCCESS) return Result; - if (Event) - MEvent->setHandle(*Event); + SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } case CGType::AdviseUSM: { @@ -3225,8 +3229,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { Result != UR_RESULT_SUCCESS) return Result; - if (Event) - MEvent->setHandle(*Event); + SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } case CGType::Copy2DUSM: { @@ -3238,8 +3241,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { Result != UR_RESULT_SUCCESS) return Result; - if (Event) - MEvent->setHandle(*Event); + SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } case CGType::Fill2DUSM: { @@ -3251,8 +3253,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { Result != UR_RESULT_SUCCESS) return Result; - if (Event) - MEvent->setHandle(*Event); + SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } case CGType::Memset2DUSM: { @@ -3264,8 +3265,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { Result != UR_RESULT_SUCCESS) return Result; - if (Event) - MEvent->setHandle(*Event); + SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } case CGType::CodeplayHostTask: { @@ -3405,8 +3405,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { MQueue->getAdapter()->call( MQueue->getHandleRef(), InteropFreeFunc, &CustomOpData, ReqMems.size(), ReqMems.data(), nullptr, RawEvents.size(), RawEvents.data(), Event); - if (Event) - MEvent->setHandle(*Event); + SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } case CGType::Barrier: { @@ -3416,8 +3415,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { MEvent->setHostEnqueueTime(); Adapter->call( MQueue->getHandleRef(), 0, nullptr, Event); - if (Event) - MEvent->setHandle(*Event); + SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } case CGType::BarrierWaitlist: { @@ -3434,8 +3432,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { MEvent->setHostEnqueueTime(); Adapter->call( MQueue->getHandleRef(), UrEvents.size(), &UrEvents[0], Event); - if (Event) - MEvent->setHandle(*Event); + SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } case CGType::ProfilingTag: { @@ -3482,8 +3479,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { Adapter->call(PostTimestampBarrierEvent); } - if (Event) - MEvent->setHandle(*Event); + SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } case CGType::CopyToDeviceGlobal: { @@ -3496,8 +3492,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { Result != UR_RESULT_SUCCESS) return Result; - if (Event) - MEvent->setHandle(*Event); + SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } case CGType::CopyFromDeviceGlobal: { @@ -3511,8 +3506,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { Result != UR_RESULT_SUCCESS) return Result; - if (Event) - MEvent->setHandle(*Event); + SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } case CGType::ReadWriteHostPipe: { @@ -3543,8 +3537,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { CmdBufferCG->MCommandBuffer, MQueue->getHandleRef(), RawEvents.size(), RawEvents.empty() ? nullptr : &RawEvents[0], Event); - if (Event) - MEvent->setHandle(*Event); + SetEventHandleOrDiscard(); return Err; } @@ -3560,8 +3553,7 @@ ur_result_t ExecCGCommand::enqueueImpQueue() { Result != UR_RESULT_SUCCESS) return Result; - if (Event) - MEvent->setHandle(*Event); + SetEventHandleOrDiscard(); return UR_RESULT_SUCCESS; } diff --git a/sycl/test-e2e/Regression/multi_thread_enqueue_discarded.cpp b/sycl/test-e2e/Regression/multi_thread_enqueue_discarded.cpp new file mode 100644 index 0000000000000..2d56dda5908e6 --- /dev/null +++ b/sycl/test-e2e/Regression/multi_thread_enqueue_discarded.cpp @@ -0,0 +1,37 @@ +// REQUIRES: aspect-usm_device_allocations +// RUN: %{build} %threads_lib -o %t.out +// RUN: %{run} %t.out + +// Regression test for a case where parallel work with enqueue functions +// discarding their results would cause implicit waits on discarded events. + +#include +#include +#include +#include +#include + +void threadFunction(int) { + sycl::queue Q{{sycl::property::queue::in_order()}}; + + constexpr int Size = 128 * 128 * 128; + int *DevMem = sycl::malloc_device(Size, Q); + + sycl::ext::oneapi::experimental::submit( + Q, [&](sycl::handler &cgh) { cgh.fill(DevMem, 1, Size); }); + Q.wait_and_throw(); + + sycl::free(DevMem, Q); +} + +int main() { + constexpr size_t NThreads = 2; + std::array Threads; + + for (size_t I = 0; I < NThreads; I++) + Threads[I] = std::thread{threadFunction, I}; + for (size_t I = 0; I < NThreads; I++) + Threads[I].join(); + + return 0; +} diff --git a/sycl/unittests/Extensions/EnqueueFunctionsEvents.cpp b/sycl/unittests/Extensions/EnqueueFunctionsEvents.cpp index 9b92c850c1f86..7c9b682f4e5c4 100644 --- a/sycl/unittests/Extensions/EnqueueFunctionsEvents.cpp +++ b/sycl/unittests/Extensions/EnqueueFunctionsEvents.cpp @@ -7,6 +7,8 @@ //===----------------------------------------------------------------------===// // Tests the behavior of enqueue free functions when events can be discarded. +#include "detail/event_impl.hpp" +#include "detail/queue_impl.hpp" #include "sycl/platform.hpp" #include #include @@ -107,6 +109,13 @@ class EnqueueFunctionsEventsTests : public ::testing::Test { queue Q; }; +inline void CheckLastEventDiscarded(sycl::queue &Q) { + auto QueueImplPtr = sycl::detail::getSyclObjImpl(Q); + event LastEvent = QueueImplPtr->getLastEvent(); + auto LastEventImplPtr = sycl::detail::getSyclObjImpl(LastEvent); + ASSERT_TRUE(LastEventImplPtr->isDiscarded()); +} + TEST_F(EnqueueFunctionsEventsTests, SubmitSingleTaskNoEvent) { mock::getCallbacks().set_replace_callback("urEnqueueKernelLaunch", &redefined_urEnqueueKernelLaunch); @@ -116,6 +125,8 @@ TEST_F(EnqueueFunctionsEventsTests, SubmitSingleTaskNoEvent) { }); ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + + CheckLastEventDiscarded(Q); } TEST_F(EnqueueFunctionsEventsTests, SingleTaskShortcutNoEvent) { @@ -125,6 +136,8 @@ TEST_F(EnqueueFunctionsEventsTests, SingleTaskShortcutNoEvent) { oneapiext::single_task>(Q, []() {}); ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + + CheckLastEventDiscarded(Q); } TEST_F(EnqueueFunctionsEventsTests, SubmitSingleTaskKernelNoEvent) { @@ -144,6 +157,8 @@ TEST_F(EnqueueFunctionsEventsTests, SubmitSingleTaskKernelNoEvent) { [&](handler &CGH) { oneapiext::single_task(CGH, Kernel); }); ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + + CheckLastEventDiscarded(Q); } TEST_F(EnqueueFunctionsEventsTests, SingleTaskShortcutKernelNoEvent) { @@ -163,6 +178,8 @@ TEST_F(EnqueueFunctionsEventsTests, SingleTaskShortcutKernelNoEvent) { oneapiext::single_task(Q, Kernel); ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + + CheckLastEventDiscarded(Q); } TEST_F(EnqueueFunctionsEventsTests, SubmitRangeParallelForNoEvent) { @@ -174,6 +191,8 @@ TEST_F(EnqueueFunctionsEventsTests, SubmitRangeParallelForNoEvent) { }); ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + + CheckLastEventDiscarded(Q); } TEST_F(EnqueueFunctionsEventsTests, RangeParallelForShortcutNoEvent) { @@ -183,6 +202,8 @@ TEST_F(EnqueueFunctionsEventsTests, RangeParallelForShortcutNoEvent) { oneapiext::parallel_for>(Q, range<1>{32}, [](item<1>) {}); ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + + CheckLastEventDiscarded(Q); } TEST_F(EnqueueFunctionsEventsTests, SubmitRangeParallelForKernelNoEvent) { @@ -203,6 +224,8 @@ TEST_F(EnqueueFunctionsEventsTests, SubmitRangeParallelForKernelNoEvent) { }); ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + + CheckLastEventDiscarded(Q); } TEST_F(EnqueueFunctionsEventsTests, RangeParallelForShortcutKernelNoEvent) { @@ -222,6 +245,8 @@ TEST_F(EnqueueFunctionsEventsTests, RangeParallelForShortcutKernelNoEvent) { oneapiext::parallel_for(Q, range<1>{32}, Kernel); ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + + CheckLastEventDiscarded(Q); } TEST_F(EnqueueFunctionsEventsTests, SubmitNDLaunchNoEvent) { @@ -234,6 +259,8 @@ TEST_F(EnqueueFunctionsEventsTests, SubmitNDLaunchNoEvent) { }); ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + + CheckLastEventDiscarded(Q); } TEST_F(EnqueueFunctionsEventsTests, NDLaunchShortcutNoEvent) { @@ -244,6 +271,8 @@ TEST_F(EnqueueFunctionsEventsTests, NDLaunchShortcutNoEvent) { [](nd_item<1>) {}); ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + + CheckLastEventDiscarded(Q); } TEST_F(EnqueueFunctionsEventsTests, SubmitNDLaunchKernelNoEvent) { @@ -264,6 +293,8 @@ TEST_F(EnqueueFunctionsEventsTests, SubmitNDLaunchKernelNoEvent) { }); ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + + CheckLastEventDiscarded(Q); } TEST_F(EnqueueFunctionsEventsTests, NDLaunchShortcutKernelNoEvent) { @@ -283,6 +314,8 @@ TEST_F(EnqueueFunctionsEventsTests, NDLaunchShortcutKernelNoEvent) { oneapiext::nd_launch(Q, nd_range<1>{range<1>{32}, range<1>{32}}, Kernel); ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + + CheckLastEventDiscarded(Q); } TEST_F(EnqueueFunctionsEventsTests, SubmitMemcpyNoEvent) { @@ -299,6 +332,8 @@ TEST_F(EnqueueFunctionsEventsTests, SubmitMemcpyNoEvent) { ASSERT_EQ(counter_urUSMEnqueueMemcpy, size_t{1}); + CheckLastEventDiscarded(Q); + free(Src, Q); free(Dst, Q); } @@ -315,6 +350,8 @@ TEST_F(EnqueueFunctionsEventsTests, MemcpyShortcutNoEvent) { ASSERT_EQ(counter_urUSMEnqueueMemcpy, size_t{1}); + CheckLastEventDiscarded(Q); + free(Src, Q); free(Dst, Q); } @@ -332,6 +369,8 @@ TEST_F(EnqueueFunctionsEventsTests, SubmitCopyNoEvent) { ASSERT_EQ(counter_urUSMEnqueueMemcpy, size_t{1}); + CheckLastEventDiscarded(Q); + free(Src, Q); free(Dst, Q); } @@ -348,6 +387,8 @@ TEST_F(EnqueueFunctionsEventsTests, CopyShortcutNoEvent) { ASSERT_EQ(counter_urUSMEnqueueMemcpy, size_t{1}); + CheckLastEventDiscarded(Q); + free(Src, Q); free(Dst, Q); } @@ -365,6 +406,8 @@ TEST_F(EnqueueFunctionsEventsTests, SubmitMemsetNoEvent) { ASSERT_EQ(counter_urUSMEnqueueFill, size_t{1}); + CheckLastEventDiscarded(Q); + free(Dst, Q); } @@ -379,6 +422,8 @@ TEST_F(EnqueueFunctionsEventsTests, MemsetShortcutNoEvent) { ASSERT_EQ(counter_urUSMEnqueueFill, size_t{1}); + CheckLastEventDiscarded(Q); + free(Dst, Q); } @@ -394,6 +439,8 @@ TEST_F(EnqueueFunctionsEventsTests, SubmitPrefetchNoEvent) { ASSERT_EQ(counter_urUSMEnqueuePrefetch, size_t{1}); + CheckLastEventDiscarded(Q); + free(Dst, Q); } @@ -408,6 +455,8 @@ TEST_F(EnqueueFunctionsEventsTests, PrefetchShortcutNoEvent) { ASSERT_EQ(counter_urUSMEnqueuePrefetch, size_t{1}); + CheckLastEventDiscarded(Q); + free(Dst, Q); } @@ -424,6 +473,8 @@ TEST_F(EnqueueFunctionsEventsTests, SubmitMemAdviseNoEvent) { ASSERT_EQ(counter_urUSMEnqueueMemAdvise, size_t{1}); + CheckLastEventDiscarded(Q); + free(Dst, Q); } @@ -438,6 +489,8 @@ TEST_F(EnqueueFunctionsEventsTests, MemAdviseShortcutNoEvent) { ASSERT_EQ(counter_urUSMEnqueueMemAdvise, size_t{1}); + CheckLastEventDiscarded(Q); + free(Dst, Q); }