diff --git a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp index ffa071f209580..864ea780083fe 100644 --- a/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp +++ b/sycl/include/sycl/ext/oneapi/experimental/enqueue_functions.hpp @@ -450,7 +450,8 @@ inline void execute_graph(handler &CGH, inline void execute_graph(queue Q, command_graph &G, const sycl::detail::code_location &CodeLoc = sycl::detail::code_location::current()) { - submit(std::move(Q), [&](handler &CGH) { execute_graph(CGH, G); }, CodeLoc); + submit_graph_direct_without_event_impl(std::move(Q), G, /*DepEvents*/ {}, + CodeLoc); } } // namespace ext::oneapi::experimental diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 0b348ec7ff256..492c971bdc63e 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -81,6 +81,20 @@ void __SYCL_EXPORT submit_kernel_direct_without_event_impl( const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); +event __SYCL_EXPORT submit_graph_direct_with_event_impl( + const queue &Queue, + ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable> &G, + sycl::span DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()); + +void __SYCL_EXPORT submit_graph_direct_without_event_impl( + const queue &Queue, + ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable> &G, + sycl::span DepEvents, + const detail::code_location &CodeLoc = detail::code_location::current()); + namespace detail { class queue_impl; @@ -3706,7 +3720,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { ext::oneapi::experimental::graph_state::executable> Graph, const detail::code_location &CodeLoc = detail::code_location::current()) { - return submit([&](handler &CGH) { CGH.ext_oneapi_graph(Graph); }, CodeLoc); + return submit_graph_direct_with_event_impl(*this, Graph, /*DepEvents*/ {}, + CodeLoc); } /// Shortcut for executing a graph of commands with a single dependency. @@ -3721,12 +3736,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { Graph, event DepEvent, const detail::code_location &CodeLoc = detail::code_location::current()) { - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvent); - CGH.ext_oneapi_graph(Graph); - }, - CodeLoc); + return submit_graph_direct_with_event_impl( + *this, Graph, sycl::span(&DepEvent, 1), CodeLoc); } /// Shortcut for executing a graph of commands with multiple dependencies. @@ -3741,12 +3752,8 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { Graph, const std::vector &DepEvents, const detail::code_location &CodeLoc = detail::code_location::current()) { - return submit( - [&](handler &CGH) { - CGH.depends_on(DepEvents); - CGH.ext_oneapi_graph(Graph); - }, - CodeLoc); + return submit_graph_direct_with_event_impl(*this, Graph, DepEvents, + CodeLoc); } /// Provides a hint to the runtime that previously issued commands to this diff --git a/sycl/source/detail/graph/graph_impl.cpp b/sycl/source/detail/graph/graph_impl.cpp index 0f03ddb6b00a4..9d995bf4935c1 100644 --- a/sycl/source/detail/graph/graph_impl.cpp +++ b/sycl/source/detail/graph/graph_impl.cpp @@ -1204,7 +1204,7 @@ exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue, return SignalEvent; } -EventImplPtr +std::pair exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue, sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded) { @@ -1213,19 +1213,17 @@ exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue, cleanupExecutionEvents(MSchedulerDependencies); CGData.MEvents.insert(CGData.MEvents.end(), MSchedulerDependencies.begin(), MSchedulerDependencies.end()); - bool IsCGDataSafeForSchedulerBypass = detail::Scheduler::areEventsSafeForSchedulerBypass( CGData.MEvents, Queue.getContextImpl()) && CGData.MRequirements.empty(); + bool SkipScheduler = IsCGDataSafeForSchedulerBypass && !MContainsHostTask; // This variable represents the returned event. It will always be nullptr if // EventNeeded is false. EventImplPtr SignalEvent; - if (!MContainsHostTask) { - bool SkipScheduler = - IsCGDataSafeForSchedulerBypass && MPartitions[0]->MRequirements.empty(); + SkipScheduler = SkipScheduler && MPartitions[0]->MRequirements.empty(); if (SkipScheduler) { SignalEvent = enqueuePartitionDirectly(MPartitions[0], Queue, CGData.MEvents, EventNeeded); @@ -1258,7 +1256,7 @@ exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue, SignalEvent->setProfilingEnabled(MEnableProfiling); } - return SignalEvent; + return {SignalEvent, SkipScheduler}; } void exec_graph_impl::duplicateNodes() { diff --git a/sycl/source/detail/graph/graph_impl.hpp b/sycl/source/detail/graph/graph_impl.hpp index 10cbbfab0282c..72e06a6b5bfdc 100644 --- a/sycl/source/detail/graph/graph_impl.hpp +++ b/sycl/source/detail/graph/graph_impl.hpp @@ -640,11 +640,14 @@ class exec_graph_impl { /// @param CGData Command-group data provided by the sycl::handler /// @param EventNeeded Whether an event signalling the completion of this /// operation needs to be returned. - /// @return Returns an event if EventNeeded is true. Returns nullptr - /// otherwise. - EventImplPtr enqueue(sycl::detail::queue_impl &Queue, - sycl::detail::CG::StorageInitHelper CGData, - bool EventNeeded); + /// @return Returns a pair of an event and a boolean indicating whether the + /// scheduler was bypassed. If an event is required, then the first element of + /// the pair is the event representing the execution of the graph. If no event + /// is required, the first element is nullptr. The second element is true if + /// the scheduler was bypassed, false otherwise. + std::pair + enqueue(sycl::detail::queue_impl &Queue, + sycl::detail::CG::StorageInitHelper CGData, bool EventNeeded); /// Iterates through all the nodes in the graph to build the list of /// accessor requirements for the whole graph and for each partition. diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 7f38c9266a37e..cedfa623082e1 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -326,6 +326,16 @@ void queue_impl::addEvent(const detail::EventImplPtr &EventImpl) { } } +void queue_impl::addEventUnlocked(const detail::EventImplPtr &EventImpl) { + if (!EventImpl) + return; + Command *Cmd = EventImpl->getCommand(); + if (Cmd != nullptr && EventImpl->getHandle() == nullptr) { + std::weak_ptr EventWeakPtr{EventImpl}; + MEventsWeak.push_back(std::move(EventWeakPtr)); + } +} + detail::EventImplPtr queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, bool CallerNeedsEvent, const detail::code_location &Loc, @@ -574,16 +584,23 @@ EventImplPtr queue_impl::submit_kernel_direct_impl( KData.validateAndSetKernelLaunchProperties(Props, hasCommandGraph(), getDeviceImpl()); - auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData, - bool SchedulerBypass) -> EventImplPtr { + auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData) + -> std::pair { + bool SchedulerBypass = + (CGData.MEvents.size() > 0 + ? detail::Scheduler::areEventsSafeForSchedulerBypass( + CGData.MEvents, getContextImpl()) + : true) && + !hasCommandGraph(); if (SchedulerBypass) { // No need to copy/move the kernel function, so we set // the function pointer to the original function KData.setKernelFunc(HostKernel.getPtr()); - return submit_kernel_scheduler_bypass(KData, CGData.MEvents, - CallerNeedsEvent, nullptr, nullptr, - CodeLoc, IsTopCodeLoc); + return {submit_kernel_scheduler_bypass(KData, CGData.MEvents, + CallerNeedsEvent, nullptr, nullptr, + CodeLoc, IsTopCodeLoc), + SchedulerBypass}; } std::unique_ptr CommandGroup; std::vector> StreamStorage; @@ -611,24 +628,63 @@ EventImplPtr queue_impl::submit_kernel_direct_impl( CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; if (auto GraphImpl = getCommandGraph(); GraphImpl) { - return submit_command_to_graph(*GraphImpl, std::move(CommandGroup), - detail::CGType::Kernel); + return {submit_command_to_graph(*GraphImpl, std::move(CommandGroup), + detail::CGType::Kernel), + SchedulerBypass}; } - return detail::Scheduler::getInstance().addCG(std::move(CommandGroup), - *this, true); + return {detail::Scheduler::getInstance().addCG(std::move(CommandGroup), + *this, true), + SchedulerBypass}; }; - return submit_direct(CallerNeedsEvent, DepEvents, SubmitKernelFunc); + return submit_direct(CallerNeedsEvent, DepEvents, SubmitKernelFunc, + detail::CGType::Kernel, + /*CommandFuncContainsHostTask*/ false); +} + +EventImplPtr queue_impl::submit_graph_direct_impl( + std::shared_ptr + ExecGraph, + bool CallerNeedsEvent, sycl::span DepEvents, + [[maybe_unused]] const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + bool EventNeeded = CallerNeedsEvent || ExecGraph->containsHostTask() || + !supportsDiscardingPiEvents(); + auto SubmitGraphFunc = [&](detail::CG::StorageInitHelper CGData) + -> std::pair { + if (auto ParentGraph = getCommandGraph(); ParentGraph) { + std::unique_ptr CommandGroup; + { + ext::oneapi::experimental::detail::graph_impl::ReadLock ExecLock( + ExecGraph->MMutex); + CGData.MRequirements = ExecGraph->getRequirements(); + } + // Here we are using the CommandGroup without passing a CommandBuffer to + // pass the exec_graph_impl and event dependencies. Since this subgraph + // CG will not be executed this is fine. + CommandGroup.reset( + new sycl::detail::CGExecCommandBuffer(nullptr, ExecGraph, CGData)); + CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; + return {submit_command_to_graph(*ParentGraph, std::move(CommandGroup), + detail::CGType::ExecCommandBuffer), + /*SchedulerBypass*/ false}; + } else { + return ExecGraph->enqueue(*this, CGData, EventNeeded); + } + }; + return submit_direct(CallerNeedsEvent, DepEvents, SubmitGraphFunc, + detail::CGType::ExecCommandBuffer, + ExecGraph->containsHostTask()); } template -detail::EventImplPtr -queue_impl::submit_direct(bool CallerNeedsEvent, - sycl::span DepEvents, - SubmitCommandFuncType &SubmitCommandFunc) { +detail::EventImplPtr queue_impl::submit_direct( + bool CallerNeedsEvent, sycl::span DepEvents, + SubmitCommandFuncType &SubmitCommandFunc, detail::CGType Type, + bool CommandFuncContainsHostTask) { detail::CG::StorageInitHelper CGData; std::unique_lock Lock(MMutex); + const bool inOrder = isInOrder(); // Used by queue_empty() and getLastEvent() MEmpty.store(false, std::memory_order_release); @@ -639,29 +695,35 @@ queue_impl::submit_direct(bool CallerNeedsEvent, registerEventDependency( getSyclObjImpl(*ExternalEvent), CGData.MEvents, this, getContextImpl(), getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr, - detail::CGType::Kernel); + Type); } auto &Deps = hasCommandGraph() ? MExtGraphDeps : MDefaultGraphDeps; // Sync with the last event for in order queue EventImplPtr &LastEvent = Deps.LastEventPtr; - if (isInOrder() && LastEvent) { + if (inOrder && LastEvent) { registerEventDependency( LastEvent, CGData.MEvents, this, getContextImpl(), getDeviceImpl(), - hasCommandGraph() ? getCommandGraph().get() : nullptr, - detail::CGType::Kernel); + hasCommandGraph() ? getCommandGraph().get() : nullptr, Type); + } else if (inOrder && MNoLastEventMode && CommandFuncContainsHostTask) { + // If we have a host task in an in-order queue with no last event mode, then + // we must add a barrier to ensure ordering. + auto ResEvent = insertHelperBarrier(); + registerEventDependency( + ResEvent, CGData.MEvents, this, getContextImpl(), getDeviceImpl(), + hasCommandGraph() ? getCommandGraph().get() : nullptr, Type); } for (event e : DepEvents) { registerEventDependency( getSyclObjImpl(e), CGData.MEvents, this, getContextImpl(), getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr, - detail::CGType::Kernel); + Type); } // Barrier and un-enqueued commands synchronization for out or order queue - if (!isInOrder()) { + if (!inOrder) { MMissedCleanupRequests.unset( [&](MissedCleanupRequestsType &MissedCleanupRequests) { for (auto &UpdatedGraph : MissedCleanupRequests) @@ -674,31 +736,27 @@ queue_impl::submit_direct(bool CallerNeedsEvent, } } - bool SchedulerBypass = - (CGData.MEvents.size() > 0 - ? detail::Scheduler::areEventsSafeForSchedulerBypass( - CGData.MEvents, getContextImpl()) - : true) && - !hasCommandGraph(); + auto [EventImpl, SchedulerBypass] = SubmitCommandFunc(CGData); // Synchronize with the "no last event mode", used by the handler-based // kernel submit path - MNoLastEventMode.store(isInOrder() && SchedulerBypass, - std::memory_order_relaxed); - - EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass); + MNoLastEventMode.store(inOrder && SchedulerBypass, std::memory_order_relaxed); // Sync with the last event for in order queue. For scheduler-bypass flow, // the ordering is done at the layers below the SYCL runtime, // but for the scheduler-based flow, it needs to be done here, as the // scheduler handles host task submissions. - if (isInOrder()) { + if (inOrder) { LastEvent = SchedulerBypass ? nullptr : EventImpl; } - // Barrier and un-enqueued commands synchronization for out or order queue - if (!isInOrder() && !EventImpl->isEnqueued()) { - Deps.UnenqueuedCmdEvents.push_back(EventImpl); + // Barrier and un-enqueued commands synchronization for out or order queue. + // The event must also be stored for future wait calls. + if (!inOrder) { + if (!EventImpl->isEnqueued()) { + Deps.UnenqueuedCmdEvents.push_back(EventImpl); + } + addEventUnlocked(EventImpl); } return CallerNeedsEvent ? EventImpl : nullptr; @@ -1149,6 +1207,15 @@ void queue_impl::verifyProps(const property_list &Props) const { CheckPropertiesWithData); } +EventImplPtr queue_impl::insertHelperBarrier() { + auto ResEvent = detail::event_impl::create_device_event(*this); + ur_event_handle_t UREvent = nullptr; + getAdapter().call( + getHandleRef(), 0, nullptr, &UREvent); + ResEvent->setHandle(UREvent); + return ResEvent; +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 4a2d9a28fd8cc..6b9d82d502dcd 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -385,6 +385,24 @@ class queue_impl : public std::enable_shared_from_this { CodeLoc, IsTopCodeLoc); } + void submit_graph_direct_without_event( + std::shared_ptr + ExecGraph, + sycl::span DepEvents, const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { + submit_graph_direct_impl(ExecGraph, false, DepEvents, CodeLoc, + IsTopCodeLoc); + } + + event submit_graph_direct_with_event( + std::shared_ptr + ExecGraph, + sycl::span DepEvents, const detail::code_location &CodeLoc, + bool IsTopCodeLoc) { + return createSyclObjFromImpl(submit_graph_direct_impl( + ExecGraph, true, DepEvents, CodeLoc, IsTopCodeLoc)); + } + void submit_without_event(const detail::type_erased_cgfo_ty &CGF, const v1::SubmissionInfo &SubmitInfo, const detail::code_location &Loc, @@ -706,15 +724,12 @@ class queue_impl : public std::enable_shared_from_this { } protected: + EventImplPtr insertHelperBarrier(); + template EventImplPtr insertHelperBarrier(const HandlerType &Handler) { queue_impl &Queue = Handler.impl->get_queue(); - auto ResEvent = detail::event_impl::create_device_event(Queue); - ur_event_handle_t UREvent = nullptr; - getAdapter().call( - Queue.getHandleRef(), 0, nullptr, &UREvent); - ResEvent->setHandle(UREvent); - return ResEvent; + return Queue.insertHelperBarrier(); } template @@ -910,6 +925,7 @@ class queue_impl : public std::enable_shared_from_this { /// \param DeviceKernelInfo is a structure aggregating kernel related data /// \param CallerNeedsEvent is a boolean indicating whether the event is /// required by the user after the call. + /// \param DepEvents is a vector of dependencies of the operation. /// \param CodeLoc is the code location of the submit call /// \param IsTopCodeLoc Used to determine if the object is in a local /// scope or in the top level scope. @@ -922,10 +938,28 @@ class queue_impl : public std::enable_shared_from_this { const detail::KernelPropertyHolderStructTy &Props, const detail::code_location &CodeLoc, bool IsTopCodeLoc); + /// Performs graph submission to the queue. + /// + /// \param ExecGraph is an executable graph + /// \param CallerNeedsEvent is a boolean indicating whether the event is + /// required by the user after the call. + /// \param DepEvents is a vector of dependencies of the operation. + /// \param CodeLoc is the code location of the submit call + /// \param IsTopCodeLoc Used to determine if the object is in a local + /// scope or in the top level scope. + /// + /// \return a SYCL event representing submitted command group or nullptr. + EventImplPtr submit_graph_direct_impl( + std::shared_ptr + ExecGraph, + bool CallerNeedsEvent, sycl::span DepEvents, + const detail::code_location &CodeLoc, bool IsTopCodeLoc); + template - EventImplPtr submit_direct(bool CallerNeedsEvent, - sycl::span DepEvents, - SubmitCommandFuncType &SubmitCommandFunc); + EventImplPtr + submit_direct(bool CallerNeedsEvent, sycl::span DepEvents, + SubmitCommandFuncType &SubmitCommandFunc, detail::CGType Type, + bool CommandFuncContainsHostTask); /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. @@ -986,6 +1020,12 @@ class queue_impl : public std::enable_shared_from_this { /// \param EventImpl is the event to be stored void addEvent(const detail::EventImplPtr &EventImpl); + /// Stores an event that should be associated with the queue with + /// the queue lock already acquired by caller. + /// + /// \param EventImpl is the event to be stored + void addEventUnlocked(const detail::EventImplPtr &EventImpl); + /// Protects all the fields that can be changed by class' methods. mutable std::mutex MMutex; diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index fbcd88f1bd42a..0d450546a56c8 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -800,7 +800,7 @@ event handler::finalize() { bool DiscardEvent = !impl->MEventNeeded && Queue.supportsDiscardingPiEvents() && !impl->MExecGraph->containsHostTask(); - detail::EventImplPtr GraphCompletionEvent = impl->MExecGraph->enqueue( + auto [GraphCompletionEvent, Unused] = impl->MExecGraph->enqueue( Queue, std::move(impl->CGData), !DiscardEvent); #ifdef __INTEL_PREVIEW_BREAKING_CHANGES return GraphCompletionEvent; diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index f5858217d23e7..888832955a9d1 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -487,6 +487,28 @@ void submit_kernel_direct_without_event_impl( IsTopCodeLoc); } +event submit_graph_direct_with_event_impl( + const queue &Queue, + ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable> &G, + sycl::span DepEvents, const detail::code_location &CodeLoc) { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + return getSyclObjImpl(Queue)->submit_graph_direct_with_event( + getSyclObjImpl(G), DepEvents, TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); +} + +void submit_graph_direct_without_event_impl( + const queue &Queue, + ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable> &G, + sycl::span DepEvents, const detail::code_location &CodeLoc) { + detail::tls_code_loc_t TlsCodeLocCapture(CodeLoc); + getSyclObjImpl(Queue)->submit_graph_direct_without_event( + getSyclObjImpl(G), DepEvents, TlsCodeLocCapture.query(), + TlsCodeLocCapture.isToplevel()); +} + } // namespace _V1 } // namespace sycl diff --git a/sycl/test-e2e/Graph/RecordReplay/host_task_in_order_dependency.cpp b/sycl/test-e2e/Graph/RecordReplay/host_task_in_order_dependency.cpp new file mode 100644 index 0000000000000..55ab30fcafd3c --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/host_task_in_order_dependency.cpp @@ -0,0 +1,54 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// +// XFAIL: level_zero && windows && gpu-intel-gen12 +// XFAIL-TRACKER: https://github.com/intel/llvm/issues/20696 +// +// REQUIRES: aspect-usm_host_allocations + +// Tests injected barrier between an in-order operation in no event mode and a +// graph consisting of a single host_task. Test attempts to produce a race +// condition if the barrier is not correctly injected. + +#include "../graph_common.hpp" + +#include +#include + +int main() { + constexpr int KernelValue = 1; + constexpr int HostTaskValue = 7; + sycl::queue Q{sycl::property::queue::in_order{}}; + + int *HostUSM = sycl::malloc_host(1, Q); + + // Record graph with a single host_task that overwrites the value. + sycl::ext::oneapi::experimental::command_graph Graph{Q.get_context(), + Q.get_device()}; + Graph.begin_recording(Q); + Q.submit([&](sycl::handler &H) { + H.host_task([=]() { *HostUSM = HostTaskValue; }); + }); + Graph.end_recording(Q); + auto ExecGraph = Graph.finalize(); + + exp_ext::single_task(Q, [=]() { + // Empirically determined to trigger race condition when + // barrier is removed. + int SpinIters = 500; + for (volatile int i = 0; i < SpinIters; i += 1) { + *HostUSM = KernelValue; + } + }); + + // Due to in-order queue, implicit dependency on prior event. Scheduler should + // inject barrier. + exp_ext::execute_graph(Q, ExecGraph); + Q.wait_and_throw(); + int ActualValue = *HostUSM; + assert(check_value(0, HostTaskValue, ActualValue, "HostUSM")); + sycl::free(HostUSM, Q); + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_handlerless_deps.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_handlerless_deps.cpp new file mode 100644 index 0000000000000..1e82672c67f52 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_handlerless_deps.cpp @@ -0,0 +1,79 @@ +// RUN: %{build} -o %t.out +// RUN: %{run} %t.out +// Extra run to check for leaks in Level Zero using UR_L0_LEAKS_DEBUG +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} +// + +// Tests adding a sub-graph to an out-of-order queue using the handler-less +// path with event dependencies + +#include "../graph_common.hpp" + +int main() { + queue Queue; + + exp_ext::command_graph Graph{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph SubGraph1{Queue.get_context(), Queue.get_device()}; + exp_ext::command_graph SubGraph2{Queue.get_context(), Queue.get_device()}; + + const size_t N = 10; + int *X = malloc_device(N, Queue); + int *Y = malloc_device(N, Queue); + + SubGraph1.begin_recording(Queue); + { + auto Event = Queue.submit([&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { X[it] *= 2; }); + }); + + Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.parallel_for(N, [=](id<1> it) { X[it] += 1; }); + }); + } + SubGraph1.end_recording(Queue); + auto ExecSubGraph1 = SubGraph1.finalize(); + + SubGraph2.begin_recording(Queue); + { + auto Event = Queue.submit([&](handler &CGH) { + CGH.parallel_for(N, [=](id<1> it) { Y[it] += X[it]; }); + }); + + Queue.submit([&](handler &CGH) { + CGH.depends_on(Event); + CGH.parallel_for(N, [=](id<1> it) { Y[it] *= 3; }); + }); + } + SubGraph2.end_recording(Queue); + auto ExecSubGraph2 = SubGraph2.finalize(); + + Graph.begin_recording(Queue); + auto Event1 = Queue.submit( + [&](handler &CGH) { CGH.parallel_for(N, [=](id<1> it) { X[it] = 1; }); }); + auto Event2 = Queue.ext_oneapi_graph(ExecSubGraph1, Event1); + auto Event3 = Queue.submit( + [&](handler &CGH) { CGH.parallel_for(N, [=](id<1> it) { Y[it] = 1; }); }); + auto Event4 = Queue.ext_oneapi_graph(ExecSubGraph2, {Event2, Event3}); + + Queue.submit([&](handler &CGH) { + CGH.depends_on(Event4); + CGH.parallel_for(range<1>{N}, [=](id<1> it) { X[it] += 3; }); + }); + Graph.end_recording(); + auto ExecGraph = Graph.finalize(); + + Queue.ext_oneapi_graph(ExecGraph).wait(); + + int OutputX, OutputY; + Queue.memcpy(&OutputX, X, sizeof(int)).wait(); + Queue.memcpy(&OutputY, Y, sizeof(int)).wait(); + + assert(OutputX == 6); + assert(OutputY == 12); + + sycl::free(X, Queue); + sycl::free(Y, Queue); + + return 0; +} diff --git a/sycl/test-e2e/Graph/RecordReplay/sub_graph_in_order.cpp b/sycl/test-e2e/Graph/RecordReplay/sub_graph_in_order.cpp index 946bfe29c8ca5..919bf82eec6a3 100644 --- a/sycl/test-e2e/Graph/RecordReplay/sub_graph_in_order.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/sub_graph_in_order.cpp @@ -8,6 +8,7 @@ #include "../graph_common.hpp" +#include #include int main() { @@ -39,7 +40,10 @@ int main() { Queue.submit( [&](handler &CGH) { CGH.parallel_for(N, [=](id<1> it) { X[it] = 1; }); }); - Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecSubGraph); }); + // Test all submission paths for graph + exp_ext::execute_graph(Queue, ExecSubGraph); + Queue.ext_oneapi_graph(ExecSubGraph); + Queue.submit([&](sycl::handler &CGH) { CGH.ext_oneapi_graph(ExecSubGraph); }); Queue.submit([&](handler &CGH) { CGH.parallel_for(range<1>{N}, [=](id<1> it) { X[it] += 3; }); @@ -54,7 +58,7 @@ int main() { int Output; Queue.memcpy(&Output, X, sizeof(int)).wait(); - assert(Output == 6); + assert(Output == 18); sycl::free(X, Queue); diff --git a/sycl/test/abi/sycl_symbols_linux.dump b/sycl/test/abi/sycl_symbols_linux.dump index 5c8c467b48caa..d2223fce9535b 100644 --- a/sycl/test/abi/sycl_symbols_linux.dump +++ b/sycl/test/abi/sycl_symbols_linux.dump @@ -2985,7 +2985,9 @@ _ZN4sycl3_V121__isgreaterequal_implEdd _ZN4sycl3_V121__isgreaterequal_implEff _ZN4sycl3_V122accelerator_selector_vERKNS0_6deviceE _ZN4sycl3_V128verifyUSMAllocatorPropertiesERKNS0_13property_listE +_ZN4sycl3_V135submit_graph_direct_with_event_implERKNS0_5queueERNS0_3ext6oneapi12experimental13command_graphILNS6_11graph_stateE1EEENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS0_6detail13code_locationE _ZN4sycl3_V136submit_kernel_direct_with_event_implERKNS0_5queueERKNS0_6detail16nd_range_view_v113nd_range_viewERNS4_17HostKernelRefBaseEPNS4_16DeviceKernelInfoENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS4_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSJ_5intel12experimental12cache_configENSL_17use_root_sync_keyENSL_23work_group_progress_keyENSL_22sub_group_progress_keyENSL_22work_item_progress_keyENSL_4cuda12cluster_sizeILi1EEENSV_ILi2EEENSV_ILi3EEEEEERKNS4_13code_locationEb +_ZN4sycl3_V138submit_graph_direct_without_event_implERKNS0_5queueERNS0_3ext6oneapi12experimental13command_graphILNS6_11graph_stateE1EEENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS0_6detail13code_locationE _ZN4sycl3_V139submit_kernel_direct_without_event_implERKNS0_5queueERKNS0_6detail16nd_range_view_v113nd_range_viewERNS4_17HostKernelRefBaseEPNS4_16DeviceKernelInfoENS0_4spanIKNS0_5eventELm18446744073709551615EEERKNS4_27kernel_launch_properties_v111PropsHolderIJNS0_3ext6oneapi12experimental23work_group_scratch_sizeENSJ_5intel12experimental12cache_configENSL_17use_root_sync_keyENSL_23work_group_progress_keyENSL_22sub_group_progress_keyENSL_22work_item_progress_keyENSL_4cuda12cluster_sizeILi1EEENSV_ILi2EEENSV_ILi3EEEEEERKNS4_13code_locationEb _ZN4sycl3_V13ext5intel12experimental9pipe_base13get_pipe_nameB5cxx11EPKv _ZN4sycl3_V13ext5intel12experimental9pipe_base17wait_non_blockingERKNS0_5eventE diff --git a/sycl/test/abi/sycl_symbols_windows.dump b/sycl/test/abi/sycl_symbols_windows.dump index f83f1f8c25512..94142bcf7bcb1 100644 --- a/sycl/test/abi/sycl_symbols_windows.dump +++ b/sycl/test/abi/sycl_symbols_windows.dump @@ -4474,6 +4474,8 @@ ?storeRawArg@handler@_V1@sycl@@AEAAPEAXAEBVraw_kernel_arg@experimental@oneapi@ext@23@@Z ?storeRawArg@handler@_V1@sycl@@AEAAPEAXPEBX_K@Z ?stringifyErrorCode@detail@_V1@sycl@@YAPEBDH@Z +?submit_graph_direct_with_event_impl@_V1@sycl@@YA?AVevent@12@AEBVqueue@12@AEAV?$command_graph@$00@experimental@oneapi@ext@12@V?$span@$$CBVevent@_V1@sycl@@$0?0@12@AEBUcode_location@detail@12@@Z +?submit_graph_direct_without_event_impl@_V1@sycl@@YAXAEBVqueue@12@AEAV?$command_graph@$00@experimental@oneapi@ext@12@V?$span@$$CBVevent@_V1@sycl@@$0?0@12@AEBUcode_location@detail@12@@Z ?submit_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@@Z ?submit_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@AEBUcode_location@detail@23@_N@Z ?submit_impl@queue@_V1@sycl@@AEAA?AVevent@23@V?$function@$$A6AXAEAVhandler@_V1@sycl@@@Z@std@@V123@AEBUcode_location@detail@23@@Z