diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp
index 0185c611bec59..ecce4e58ab660 100644
--- a/sycl/include/sycl/handler.hpp
+++ b/sycl/include/sycl/handler.hpp
@@ -2839,10 +2839,15 @@ class __SYCL_EXPORT handler {
   /// Executes a command_graph.
   ///
   /// \param Graph Executable command_graph to run
+#ifdef __INTEL_PREVIEW_BREAKING_CHANGES
+  void
+  ext_oneapi_graph(const ext::oneapi::experimental::command_graph<
+                   ext::oneapi::experimental::graph_state::executable> &Graph);
+#else
   void ext_oneapi_graph(ext::oneapi::experimental::command_graph<
                         ext::oneapi::experimental::graph_state::executable>
                             Graph);
-
+#endif
   /// Copies data from host to device, where \p Src is a USM pointer and \p Dest
   /// is an opaque image memory handle. An exception is thrown if either \p Src
   /// is nullptr or \p Dest is incomplete. The behavior is undefined if
diff --git a/sycl/source/detail/graph/graph_impl.cpp b/sycl/source/detail/graph/graph_impl.cpp
index 968100d2d9ea4..514ad88465419 100644
--- a/sycl/source/detail/graph/graph_impl.cpp
+++ b/sycl/source/detail/graph/graph_impl.cpp
@@ -835,7 +835,7 @@ void exec_graph_impl::createCommandBuffers(
     throw sycl::exception(errc::invalid,
                           "Failed to create UR command-buffer");
   }
-  Partition->MCommandBuffers[Device] = OutCommandBuffer;
+  Partition->MCommandBuffer = OutCommandBuffer;
 
   for (node_impl &Node : Partition->schedule()) {
     // Some nodes are not scheduled like other nodes, and only their
@@ -910,13 +910,12 @@ exec_graph_impl::~exec_graph_impl() {
 
     for (const auto &Partition : MPartitions) {
       Partition->MSchedule.clear();
-      for (const auto &Iter : Partition->MCommandBuffers) {
-        if (auto CmdBuf = Iter.second; CmdBuf) {
-          ur_result_t Res = Adapter.call_nocheck<
-              sycl::detail::UrApiKind::urCommandBufferReleaseExp>(CmdBuf);
-          (void)Res;
-          assert(Res == UR_RESULT_SUCCESS);
-        }
+      if (Partition->MCommandBuffer) {
+        ur_result_t Res = Adapter.call_nocheck<
+            sycl::detail::UrApiKind::urCommandBufferReleaseExp>(
+            Partition->MCommandBuffer);
+        (void)Res;
+        assert(Res == UR_RESULT_SUCCESS);
       }
     }
   } catch (std::exception &e) {
@@ -988,11 +987,9 @@ EventImplPtr exec_graph_impl::enqueuePartitionWithScheduler(
                             Partition->MAccessors.end());
   }
 
-  auto CommandBuffer = Partition->MCommandBuffers[Queue.get_device()];
-
   std::unique_ptr<sycl::detail::CG> CommandGroup =
       std::make_unique<sycl::detail::CGExecCommandBuffer>(
-          CommandBuffer, nullptr, std::move(CGData));
+          Partition->MCommandBuffer, nullptr, std::move(CGData));
 
   EventImplPtr SchedulerEvent = sycl::detail::Scheduler::getInstance().addCG(
       std::move(CommandGroup), Queue, EventNeeded);
@@ -1009,25 +1006,28 @@ EventImplPtr exec_graph_impl::enqueuePartitionDirectly(
     std::shared_ptr<partition> &Partition, sycl::detail::queue_impl &Queue,
     std::vector<sycl::detail::EventImplPtr> &WaitEvents, bool EventNeeded) {
 
+  ur_event_handle_t *UrEnqueueWaitList = nullptr;
+  size_t UrEnqueueWaitListSize = 0;
+
   // Create a list containing all the UR event handles in WaitEvents. WaitEvents
   // is assumed to be safe for scheduler bypass and any host-task events that it
   // contains can be ignored.
   std::vector<ur_event_handle_t> UrEventHandles{};
-  UrEventHandles.reserve(WaitEvents.size());
-  for (auto &SyclWaitEvent : WaitEvents) {
-    if (auto URHandle = SyclWaitEvent->getHandle()) {
-      UrEventHandles.push_back(URHandle);
+  if (!WaitEvents.empty()) {
+    UrEventHandles.reserve(WaitEvents.size());
+    for (auto &SyclWaitEvent : WaitEvents) {
+      if (auto URHandle = SyclWaitEvent->getHandle()) {
+        UrEventHandles.push_back(URHandle);
+      }
     }
-  }
 
-  auto CommandBuffer = Partition->MCommandBuffers[Queue.get_device()];
-  const size_t UrEnqueueWaitListSize = UrEventHandles.size();
-  const ur_event_handle_t *UrEnqueueWaitList =
-      UrEnqueueWaitListSize == 0 ? nullptr : UrEventHandles.data();
+    UrEnqueueWaitList = UrEventHandles.data();
+    UrEnqueueWaitListSize = UrEventHandles.size();
+  }
 
   if (!EventNeeded) {
     Queue.getAdapter().call<sycl::detail::UrApiKind::urEnqueueCommandBufferExp>(
-        Queue.getHandleRef(), CommandBuffer, UrEnqueueWaitListSize,
+        Queue.getHandleRef(), Partition->MCommandBuffer, UrEnqueueWaitListSize,
         UrEnqueueWaitList, nullptr);
     return nullptr;
   } else {
@@ -1037,7 +1037,7 @@ EventImplPtr exec_graph_impl::enqueuePartitionDirectly(
     NewEvent->setSubmissionTime();
     ur_event_handle_t UrEvent = nullptr;
     Queue.getAdapter().call<sycl::detail::UrApiKind::urEnqueueCommandBufferExp>(
-        Queue.getHandleRef(), CommandBuffer, UrEventHandles.size(),
+        Queue.getHandleRef(), Partition->MCommandBuffer, UrEventHandles.size(),
         UrEnqueueWaitList, &UrEvent);
     NewEvent->setHandle(UrEvent);
     NewEvent->setEventFromSubmittedExecCommandBuffer(true);
@@ -1161,17 +1161,20 @@ exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue,
 EventImplPtr
 exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue,
-                         sycl::detail::CG::StorageInitHelper CGData,
+                         sycl::detail::CG::StorageInitHelper &CGData,
                          bool EventNeeded) {
   WriteLock Lock(MMutex);
 
-  cleanupExecutionEvents(MSchedulerDependencies);
-  CGData.MEvents.insert(CGData.MEvents.end(), MSchedulerDependencies.begin(),
-                        MSchedulerDependencies.end());
+  if (!MSchedulerDependencies.empty()) {
+    cleanupExecutionEvents(MSchedulerDependencies);
+    CGData.MEvents.insert(CGData.MEvents.end(), MSchedulerDependencies.begin(),
+                          MSchedulerDependencies.end());
+  }
 
   bool IsCGDataSafeForSchedulerBypass =
-      detail::Scheduler::areEventsSafeForSchedulerBypass(
-          CGData.MEvents, Queue.getContextImpl()) &&
+      (CGData.MEvents.empty() ||
+       detail::Scheduler::areEventsSafeForSchedulerBypass(
+           CGData.MEvents, Queue.getContextImpl())) &&
       CGData.MRequirements.empty();
 
   // This variable represents the returned event. It will always be nullptr if
@@ -1478,8 +1481,7 @@ void exec_graph_impl::update(nodes_range Nodes) {
 
   auto PartitionedNodes = getURUpdatableNodes(Nodes);
   for (auto &[PartitionIndex, NodeImpl] : PartitionedNodes) {
     auto &Partition = MPartitions[PartitionIndex];
-    auto CommandBuffer = Partition->MCommandBuffers[MDevice];
-    updateURImpl(CommandBuffer, NodeImpl);
+    updateURImpl(Partition->MCommandBuffer, NodeImpl);
   }
 }
diff --git a/sycl/source/detail/graph/graph_impl.hpp b/sycl/source/detail/graph/graph_impl.hpp
index eedfcf0506bf3..2e29f950a4f9d 100644
--- a/sycl/source/detail/graph/graph_impl.hpp
+++ b/sycl/source/detail/graph/graph_impl.hpp
@@ -50,15 +50,16 @@ class dynamic_command_group_impl;
 class partition {
 public:
   /// Constructor.
-  partition() : MSchedule(), MCommandBuffers() {}
+  partition() : MSchedule() {}
 
   /// List of root nodes.
   std::set<node_impl *> MRoots;
 
   /// Execution schedule of nodes in the graph.
   std::list<node_impl *> MSchedule;
-  /// Map of devices to command buffers.
-  std::unordered_map<sycl::device, ur_exp_command_buffer_handle_t>
-      MCommandBuffers;
+
+  /// Command buffer associated with this partition.
+  ur_exp_command_buffer_handle_t MCommandBuffer = nullptr;
+
   /// List of predecessors to this partition.
   std::vector<std::shared_ptr<partition>> MPredecessors;
@@ -588,7 +589,7 @@ class graph_impl : public std::enable_shared_from_this<graph_impl> {
 };
 
 /// Class representing the implementation of command_graph.
-class exec_graph_impl {
+class exec_graph_impl : public std::enable_shared_from_this<exec_graph_impl> {
 public:
   using ReadLock = std::shared_lock<std::shared_mutex>;
   using WriteLock = std::unique_lock<std::shared_mutex>;
@@ -628,7 +629,7 @@ class exec_graph_impl {
   /// @return Returns an event if EventNeeded is true. Returns nullptr
   /// otherwise.
   EventImplPtr enqueue(sycl::detail::queue_impl &Queue,
-                       sycl::detail::CG::StorageInitHelper CGData,
+                       sycl::detail::CG::StorageInitHelper &CGData,
                        bool EventNeeded);
 
   /// Iterates through all the nodes in the graph to build the list of
diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp
index edf32dfa80f7e..479bee005a2ce 100644
--- a/sycl/source/detail/handler_impl.hpp
+++ b/sycl/source/detail/handler_impl.hpp
@@ -203,10 +203,7 @@ class handler_impl {
 
   /// If we are submitting a graph using ext_oneapi_graph this will be the graph
   /// to be executed.
-  std::shared_ptr<ext::oneapi::experimental::detail::exec_graph_impl>
-      MExecGraph;
-  /// Storage for a node created from a subgraph submission.
-  std::shared_ptr<ext::oneapi::experimental::detail::node_impl> MSubgraphNode;
+  ext::oneapi::experimental::detail::exec_graph_impl *MExecGraph;
   /// Storage for the CG created when handling graph nodes added explicitly.
   std::unique_ptr<detail::CG> MGraphNodeCG;
   /// Storage for node dependencies passed when adding a graph node explicitly
diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp
index 466a314939024..348ecc1edfd32 100644
--- a/sycl/source/detail/queue_impl.cpp
+++ b/sycl/source/detail/queue_impl.cpp
@@ -350,9 +350,6 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
   // Host and interop tasks, however, are not submitted to low-level runtimes
   // and require separate dependency management.
   const CGType Type = HandlerImpl.MCGType;
-  std::vector<StreamImplPtr> Streams;
-  if (Type == CGType::Kernel)
-    Streams = std::move(Handler.MStreamStorage);
 
   HandlerImpl.MEventMode = SubmitInfo.EventMode();
 
@@ -360,7 +357,8 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
       (Type == CGType::ExecCommandBuffer &&
        HandlerImpl.MExecGraph->containsHostTask());
 
-  auto requiresPostProcess = SubmitInfo.PostProcessorFunc() || Streams.size();
+  auto requiresPostProcess =
+      SubmitInfo.PostProcessorFunc() || Handler.MStreamStorage.size();
   auto noLastEventPath = !isHostTask &&
                          MNoLastEventMode.load(std::memory_order_acquire) &&
                          !requiresPostProcess;
@@ -402,21 +400,23 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
     handlerPostProcess(Handler, SubmitInfo.PostProcessorFunc(), Event);
   }
 
-  for (auto &Stream : Streams) {
-    // We don't want stream flushing to be blocking operation that is why submit
-    // a host task to print stream buffer. It will fire up as soon as the kernel
-    // finishes execution.
-    auto L = [&](handler &ServiceCGH) {
-      Stream->generateFlushCommand(ServiceCGH);
-    };
-    detail::type_erased_cgfo_ty CGF{L};
-    detail::EventImplPtr FlushEvent =
-        submit_impl(CGF, /*CallerNeedsEvent*/ true, Loc, IsTopCodeLoc, {});
-    if (EventImpl)
-      EventImpl->attachEventToCompleteWeak(FlushEvent);
-    if (!isInOrder()) {
-      // For in-order queue, the dependencies will be tracked by LastEvent
-      registerStreamServiceEvent(FlushEvent);
+  if (Type == CGType::Kernel) {
+    for (auto &Stream : Handler.MStreamStorage) {
+      // We don't want stream flushing to be a blocking operation, which is why
+      // we submit a host task to print the stream buffer. It will fire up as
+      // soon as the kernel finishes execution.
+      auto L = [&](handler &ServiceCGH) {
+        Stream->generateFlushCommand(ServiceCGH);
+      };
+      detail::type_erased_cgfo_ty CGF{L};
+      detail::EventImplPtr FlushEvent =
+          submit_impl(CGF, /*CallerNeedsEvent*/ true, Loc, IsTopCodeLoc, {});
+      if (EventImpl)
+        EventImpl->attachEventToCompleteWeak(FlushEvent);
+      if (!isInOrder()) {
+        // For an in-order queue, the dependencies will be tracked by LastEvent
+        registerStreamServiceEvent(FlushEvent);
+      }
     }
   }
 
diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp
index 6066ed6b3de50..c5d96d626e418 100644
--- a/sycl/source/detail/queue_impl.hpp
+++ b/sycl/source/detail/queue_impl.hpp
@@ -737,7 +737,9 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
 
     MEmpty.store(false, std::memory_order_release);
 
-    synchronizeWithExternalEvent(Handler);
+    if (MInOrderExternalEvent.read_unlocked()) {
+      synchronizeWithExternalEvent(Handler);
+    }
 
     auto Event = parseEvent(Handler.finalize());
 
@@ -778,7 +780,9 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
       MEmpty = false;
       MNoLastEventMode = false;
 
-      synchronizeWithExternalEvent(Handler);
+      if (MInOrderExternalEvent.read_unlocked()) {
+        synchronizeWithExternalEvent(Handler);
+      }
 
       EventToBuildDeps = parseEvent(Handler.finalize());
       assert(EventToBuildDeps);
@@ -808,7 +812,9 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
 
       MEmpty = false;
 
-      synchronizeWithExternalEvent(Handler);
+      if (MInOrderExternalEvent.read_unlocked()) {
+        synchronizeWithExternalEvent(Handler);
+      }
 
       EventToBuildDeps = parseEvent(Handler.finalize());
       if (EventToBuildDeps)
@@ -1010,12 +1016,16 @@ class queue_impl : public std::enable_shared_from_this<queue_impl> {
       }
     }
   }
+
   DataType read() {
     if (!MIsSet.load(std::memory_order_acquire))
      return DataType{};
     std::lock_guard<std::mutex> Lock(MDataMtx);
     return MData;
   }
+
+  // To be used when the queue's mutex is already held.
+  DataType read_unlocked() { return MData; }
 };
 
 const bool MIsInorder;
diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp
index 19beb3235e21b..d7845afedfceb 100644
--- a/sycl/source/detail/scheduler/commands.cpp
+++ b/sycl/source/detail/scheduler/commands.cpp
@@ -3845,8 +3845,7 @@ ur_result_t UpdateCommandBufferCommand::enqueueImp() {
   auto Device = MQueue->get_device();
   auto &Partitions = MGraph->getPartitions();
   for (auto &[PartitionIndex, NodeImpl] : PartitionedNodes) {
-    auto CommandBuffer = Partitions[PartitionIndex]->MCommandBuffers[Device];
-    MGraph->updateURImpl(CommandBuffer, NodeImpl);
+    MGraph->updateURImpl(Partitions[PartitionIndex]->MCommandBuffer, NodeImpl);
   }
 
   return UR_RESULT_SUCCESS;
diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp
index 79bcdbf62aa47..c6bc4a041ddd2 100644
--- a/sycl/source/handler.cpp
+++ b/sycl/source/handler.cpp
@@ -485,12 +485,18 @@ event handler::finalize() {
 #endif
   const auto &type = getType();
 
-  detail::queue_impl *Queue = impl->get_queue_or_null();
-  ext::oneapi::experimental::detail::graph_impl *Graph =
-      impl->get_graph_or_null();
+  detail::queue_impl *const Queue = impl->get_queue_or_null();
+
+  const bool IsQueueBeingRecorded = Queue && Queue->hasCommandGraph();
+  const bool IsExplicitGraphAPI = impl->get_graph_or_null() != nullptr;
+  const bool IsGraphEnqueue =
+      impl->MCGType == detail::CGType::ExecCommandBuffer;
+  const bool IsGraphRelated =
+      IsQueueBeingRecorded || IsExplicitGraphAPI || IsGraphEnqueue;
+
   const bool KernelFastPath =
-      (Queue && !Graph && !impl->MSubgraphNode && !Queue->hasCommandGraph() &&
-       !impl->CGData.MRequirements.size() && !MStreamStorage.size() &&
+      (Queue && !IsGraphRelated && !impl->CGData.MRequirements.size() &&
+       !MStreamStorage.size() &&
       detail::Scheduler::areEventsSafeForSchedulerBypass(
           impl->CGData.MEvents, Queue->getContextImpl()));
 
@@ -507,7 +513,7 @@ event handler::finalize() {
   // According to 4.7.6.9 of SYCL2020 spec, if a placeholder accessor is passed
   // to a command without being bound to a command group, an exception should
   // be thrown.
-  {
+  if (!IsGraphEnqueue) {
     for (const auto &arg : impl->MArgs) {
       if (arg.MType != detail::kernel_param_kind_t::kind_accessor)
         continue;
@@ -608,8 +614,8 @@ event handler::finalize() {
     // the graph is not changed, then this faster path is used to submit
     // kernel bypassing scheduler and avoiding CommandGroup, Command objects
    // creation.
-    std::vector<ur_event_handle_t> RawEvents = detail::Command::getUrEvents(
-        impl->CGData.MEvents, impl->get_queue_or_null(), false);
+    std::vector<ur_event_handle_t> RawEvents =
+        detail::Command::getUrEvents(impl->CGData.MEvents, Queue, false);
 
     bool DiscardEvent =
         !impl->MEventNeeded && impl->get_queue().supportsDiscardingPiEvents();
@@ -638,9 +644,8 @@ event handler::finalize() {
     if (xptiEnabled) {
       std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData(
           detail::GSYCLStreamID, MKernel, MCodeLoc, impl->MIsTopCodeLoc,
-          MKernelName.data(), impl->MKernelNameBasedCachePtr,
-          impl->get_queue_or_null(), impl->MNDRDesc, KernelBundleImpPtr,
-          impl->MArgs);
+          MKernelName.data(), impl->MKernelNameBasedCachePtr, Queue,
+          impl->MNDRDesc, KernelBundleImpPtr, impl->MArgs);
       detail::emitInstrumentationGeneral(detail::GSYCLStreamID, InstanceID,
                                          CmdTraceEvent, xpti::trace_task_begin,
                                          nullptr);
@@ -717,7 +722,7 @@ event handler::finalize() {
         impl->MNDRDesc, std::move(MHostKernel), std::move(MKernel),
         std::move(impl->MKernelBundle), std::move(impl->CGData),
         std::move(impl->MArgs), toKernelNameStrT(MKernelName),
-        impl->MKernelNameBasedCachePtr, std::move(MStreamStorage),
+        impl->MKernelNameBasedCachePtr, MStreamStorage,
         std::move(impl->MAuxiliaryResources), getType(),
         impl->MKernelCacheConfig, impl->MKernelIsCooperative,
         impl->MKernelUsesClusterLaunch, impl->MKernelWorkGroupMemorySize,
@@ -776,7 +781,6 @@ event handler::finalize() {
   case detail::CGType::EnqueueNativeCommand:
   case detail::CGType::CodeplayHostTask: {
     detail::context_impl &Context = impl->get_context();
-    detail::queue_impl *Queue = impl->get_queue_or_null();
     CommandGroup.reset(new detail::CGHostTask(
         std::move(impl->MHostTask), Queue, &Context, std::move(impl->MArgs),
         std::move(impl->CGData), getType(), MCodeLoc));
@@ -825,7 +829,6 @@ event handler::finalize() {
     break;
   }
   case detail::CGType::ExecCommandBuffer: {
-    detail::queue_impl *Queue = impl->get_queue_or_null();
     std::shared_ptr<ext::oneapi::experimental::detail::graph_impl>
         ParentGraph = Queue ? Queue->getCommandGraph()
                             : impl->get_graph().shared_from_this();
@@ -842,15 +845,15 @@ event handler::finalize() {
     // pass the exec_graph_impl and event dependencies. Since this subgraph CG
     // will not be executed this is fine.
     CommandGroup.reset(new sycl::detail::CGExecCommandBuffer(
-        nullptr, impl->MExecGraph, std::move(impl->CGData)));
+        nullptr, impl->MExecGraph->shared_from_this(),
+        std::move(impl->CGData)));
   } else {
-    detail::queue_impl &Queue = impl->get_queue();
     bool DiscardEvent = !impl->MEventNeeded &&
-                        Queue.supportsDiscardingPiEvents() &&
+                        Queue->supportsDiscardingPiEvents() &&
                         !impl->MExecGraph->containsHostTask();
-    detail::EventImplPtr GraphCompletionEvent = impl->MExecGraph->enqueue(
-        Queue, std::move(impl->CGData), !DiscardEvent);
+    detail::EventImplPtr GraphCompletionEvent =
+        impl->MExecGraph->enqueue(*Queue, impl->CGData, !DiscardEvent);
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
     return GraphCompletionEvent;
 #else
@@ -903,7 +906,7 @@ event handler::finalize() {
   // If there is a graph associated with the handler we are in the explicit
   // graph mode, so we store the CG instead of submitting it to the scheduler,
   // so it can be retrieved by the graph later.
-  if (impl->get_graph_or_null()) {
+  if (IsExplicitGraphAPI) {
     impl->MGraphNodeCG = std::move(CommandGroup);
     auto EventImpl = detail::event_impl::create_completed_host_event();
 #ifdef __INTEL_PREVIEW_BREAKING_CHANGES
@@ -918,7 +921,8 @@ event handler::finalize() {
 
   // If the queue has an associated graph then we need to take the CG and pass
   // it to the graph to create a node, rather than submit it to the scheduler.
-  if (auto GraphImpl = Queue->getCommandGraph(); GraphImpl) {
+  if (IsQueueBeingRecorded) {
+    auto GraphImpl = Queue->getCommandGraph();
     auto EventImpl = detail::event_impl::create_completed_host_event();
     EventImpl->setSubmittedQueue(Queue->weak_from_this());
     ext::oneapi::experimental::detail::node_impl *NodeImpl = nullptr;
@@ -2364,11 +2368,17 @@ void handler::setKernelWorkGroupMem(size_t Size) {
 }
 
 void handler::ext_oneapi_graph(
+#ifndef __INTEL_PREVIEW_BREAKING_CHANGES
     ext::oneapi::experimental::command_graph<
        ext::oneapi::experimental::graph_state::executable>
-        Graph) {
+        Graph
+#else
+    const ext::oneapi::experimental::command_graph<
+        ext::oneapi::experimental::graph_state::executable> &Graph
+#endif
+) {
   setType(detail::CGType::ExecCommandBuffer);
-  impl->MExecGraph = detail::getSyclObjImpl(Graph);
+  impl->MExecGraph = detail::getSyclObjImpl(Graph).get();
 }
 
 std::shared_ptr
diff --git a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp
index 04d2f81281de2..e5cf1fbe4f005 100644
--- a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp
+++ b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp
@@ -61,8 +61,7 @@ class LimitedHandler {
     handler_impl(std::shared_ptr<MockQueueImpl> Queue) : MQueue(Queue) {}
     std::shared_ptr<MockQueueImpl> MQueue;
     MockQueueImpl &get_queue() { return *MQueue; }
-    std::shared_ptr<ext::oneapi::experimental::detail::exec_graph_impl>
-        MExecGraph;
+    ext::oneapi::experimental::detail::exec_graph_impl *MExecGraph;
   };
   std::shared_ptr<handler_impl> impl;
   std::shared_ptr<detail::kernel_impl> MKernel;