From bb31b3435d991c69f847cc9246114ac77e1ac823 Mon Sep 17 00:00:00 2001 From: Fabio Mestre Date: Thu, 26 Jun 2025 18:10:15 +0100 Subject: [PATCH 1/6] Experimental graph optimizations --- sycl/include/sycl/handler.hpp | 6 +- sycl/source/detail/graph/graph_impl.cpp | 66 ++++++++++--------- sycl/source/detail/graph/graph_impl.hpp | 13 ++-- sycl/source/detail/handler_impl.hpp | 5 +- sycl/source/detail/queue_impl.cpp | 39 +++++------ sycl/source/detail/queue_impl.hpp | 18 ++++- sycl/source/detail/scheduler/commands.cpp | 3 +- sycl/source/handler.cpp | 53 ++++++++------- .../ext_oneapi_enqueue_functions.cpp | 17 +++++ .../scheduler/InOrderQueueSyncCheck.cpp | 3 +- 10 files changed, 127 insertions(+), 96 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index e074e5f1ada42..e2d5085708c1f 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -3044,9 +3044,9 @@ class __SYCL_EXPORT handler { /// Executes a command_graph. /// /// \param Graph Executable command_graph to run - void ext_oneapi_graph(ext::oneapi::experimental::command_graph< - ext::oneapi::experimental::graph_state::executable> - Graph); + void + ext_oneapi_graph(const ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable> &Graph); /// Copies data from host to device, where \p Src is a USM pointer and \p Dest /// is an opaque image memory handle. An exception is thrown if either \p Src diff --git a/sycl/source/detail/graph/graph_impl.cpp b/sycl/source/detail/graph/graph_impl.cpp index ca68ebde68512..e5df19dc30f23 100644 --- a/sycl/source/detail/graph/graph_impl.cpp +++ b/sycl/source/detail/graph/graph_impl.cpp @@ -916,7 +916,7 @@ void exec_graph_impl::createCommandBuffers( throw sycl::exception(errc::invalid, "Failed to create UR command-buffer"); } - Partition->MCommandBuffers[Device] = OutCommandBuffer; + Partition->MCommandBuffer = OutCommandBuffer; for (const auto &Node : Partition->MSchedule) { // Some nodes are not scheduled like other nodes, and only their @@ -991,13 +991,12 @@ exec_graph_impl::~exec_graph_impl() { for (const auto &Partition : MPartitions) { Partition->MSchedule.clear(); - for (const auto &Iter : Partition->MCommandBuffers) { - if (auto CmdBuf = Iter.second; CmdBuf) { - ur_result_t Res = Adapter->call_nocheck< - sycl::detail::UrApiKind::urCommandBufferReleaseExp>(CmdBuf); - (void)Res; - assert(Res == UR_RESULT_SUCCESS); - } + if (Partition->MCommandBuffer) { + ur_result_t Res = Adapter->call_nocheck< + sycl::detail::UrApiKind::urCommandBufferReleaseExp>( + Partition->MCommandBuffer); + (void)Res; + assert(Res == UR_RESULT_SUCCESS); } } } catch (std::exception &e) { @@ -1069,11 +1068,9 @@ EventImplPtr exec_graph_impl::enqueuePartitionWithScheduler( Partition->MAccessors.end()); } - auto CommandBuffer = Partition->MCommandBuffers[Queue.get_device()]; - std::unique_ptr CommandGroup = std::make_unique( - CommandBuffer, nullptr, std::move(CGData)); + Partition->MCommandBuffer, nullptr, std::move(CGData)); EventImplPtr SchedulerEvent = sycl::detail::Scheduler::getInstance().addCG( std::move(CommandGroup), Queue, EventNeeded); @@ -1090,27 +1087,30 @@ EventImplPtr exec_graph_impl::enqueuePartitionDirectly( std::shared_ptr &Partition, sycl::detail::queue_impl &Queue, std::vector &WaitEvents, bool EventNeeded) { + ur_event_handle_t *UrEnqueueWaitList = nullptr; + size_t UrEnqueueWaitListSize = 0; + // Create a list containing all the UR event handles in WaitEvents. WaitEvents // is assumed to be safe for scheduler bypass and any host-task events that it // contains can be ignored. std::vector UrEventHandles{}; - UrEventHandles.reserve(WaitEvents.size()); - for (auto &SyclWaitEvent : WaitEvents) { - if (auto URHandle = SyclWaitEvent->getHandle()) { - UrEventHandles.push_back(URHandle); + if (!WaitEvents.empty()) { + UrEventHandles.reserve(WaitEvents.size()); + for (auto &SyclWaitEvent : WaitEvents) { + if (auto URHandle = SyclWaitEvent->getHandle()) { + UrEventHandles.push_back(URHandle); + } } - } - auto CommandBuffer = Partition->MCommandBuffers[Queue.get_device()]; - const size_t UrEnqueueWaitListSize = UrEventHandles.size(); - const ur_event_handle_t *UrEnqueueWaitList = - UrEnqueueWaitListSize == 0 ? nullptr : UrEventHandles.data(); + UrEnqueueWaitList = UrEventHandles.data(); + UrEnqueueWaitListSize = UrEventHandles.size(); + } if (!EventNeeded) { Queue.getAdapter() ->call( - Queue.getHandleRef(), CommandBuffer, UrEnqueueWaitListSize, - UrEnqueueWaitList, nullptr); + Queue.getHandleRef(), Partition->MCommandBuffer, + UrEnqueueWaitListSize, UrEnqueueWaitList, nullptr); return nullptr; } else { auto NewEvent = sycl::detail::event_impl::create_device_event(Queue); @@ -1120,8 +1120,8 @@ EventImplPtr exec_graph_impl::enqueuePartitionDirectly( ur_event_handle_t UrEvent = nullptr; Queue.getAdapter() ->call( - Queue.getHandleRef(), CommandBuffer, UrEventHandles.size(), - UrEnqueueWaitList, &UrEvent); + Queue.getHandleRef(), Partition->MCommandBuffer, + UrEventHandles.size(), UrEnqueueWaitList, &UrEvent); NewEvent->setHandle(UrEvent); NewEvent->setEventFromSubmittedExecCommandBuffer(true); return NewEvent; @@ -1244,17 +1244,20 @@ exec_graph_impl::enqueuePartitions(sycl::detail::queue_impl &Queue, EventImplPtr exec_graph_impl::enqueue(sycl::detail::queue_impl &Queue, - sycl::detail::CG::StorageInitHelper CGData, + sycl::detail::CG::StorageInitHelper &CGData, bool EventNeeded) { WriteLock Lock(MMutex); - cleanupExecutionEvents(MSchedulerDependencies); - CGData.MEvents.insert(CGData.MEvents.end(), MSchedulerDependencies.begin(), - MSchedulerDependencies.end()); + if (!MSchedulerDependencies.empty()) { + cleanupExecutionEvents(MSchedulerDependencies); + CGData.MEvents.insert(CGData.MEvents.end(), MSchedulerDependencies.begin(), + MSchedulerDependencies.end()); + } bool IsCGDataSafeForSchedulerBypass = - detail::Scheduler::areEventsSafeForSchedulerBypass( - CGData.MEvents, Queue.getContextImpl()) && + (CGData.MEvents.empty() || + detail::Scheduler::areEventsSafeForSchedulerBypass( + CGData.MEvents, Queue.getContextImpl())) && CGData.MRequirements.empty(); // This variable represents the returned event. It will always be nullptr if @@ -1569,8 +1572,7 @@ void exec_graph_impl::update( auto PartitionedNodes = getURUpdatableNodes(Nodes); for (auto &[PartitionIndex, NodeImpl] : PartitionedNodes) { auto &Partition = MPartitions[PartitionIndex]; - auto CommandBuffer = Partition->MCommandBuffers[MDevice]; - updateURImpl(CommandBuffer, NodeImpl); + updateURImpl(Partition->MCommandBuffer, NodeImpl); } } diff --git a/sycl/source/detail/graph/graph_impl.hpp b/sycl/source/detail/graph/graph_impl.hpp index b6cac2eab6f8a..78f3844944f99 100644 --- a/sycl/source/detail/graph/graph_impl.hpp +++ b/sycl/source/detail/graph/graph_impl.hpp @@ -50,16 +50,17 @@ class dynamic_command_group_impl; class partition { public: /// Constructor. - partition() : MSchedule(), MCommandBuffers() {} + partition() : MSchedule() {} /// List of root nodes. std::set, std::owner_less>> MRoots; /// Execution schedule of nodes in the graph. std::list> MSchedule; - /// Map of devices to command buffers. - std::unordered_map - MCommandBuffers; + + /// Command buffer associated with this partition + ur_exp_command_buffer_handle_t MCommandBuffer = nullptr; + /// List of predecessors to this partition. std::vector MPredecessors; @@ -602,7 +603,7 @@ class graph_impl : public std::enable_shared_from_this { }; /// Class representing the implementation of command_graph. -class exec_graph_impl { +class exec_graph_impl : public std::enable_shared_from_this { public: using ReadLock = std::shared_lock; using WriteLock = std::unique_lock; @@ -642,7 +643,7 @@ class exec_graph_impl { /// @return Returns an event if EventNeeded is true. Returns nullptr /// otherwise. EventImplPtr enqueue(sycl::detail::queue_impl &Queue, - sycl::detail::CG::StorageInitHelper CGData, + sycl::detail::CG::StorageInitHelper &CGData, bool EventNeeded); /// Iterates through all the nodes in the graph to build the list of diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index 0fda3dd4f2769..3a5d2fd005c63 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -204,10 +204,7 @@ class handler_impl { /// If we are submitting a graph using ext_oneapi_graph this will be the graph /// to be executed. - std::shared_ptr - MExecGraph; - /// Storage for a node created from a subgraph submission. - std::shared_ptr MSubgraphNode; + ext::oneapi::experimental::detail::exec_graph_impl* MExecGraph; /// Storage for the CG created when handling graph nodes added explicitly. std::unique_ptr MGraphNodeCG; /// Storage for node dependencies passed when adding a graph node explicitly diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index d5b20c6e55527..fb97e6ffbb27b 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -330,9 +330,6 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, // Host and interop tasks, however, are not submitted to low-level runtimes // and require separate dependency management. const CGType Type = HandlerImpl->MCGType; - std::vector Streams; - if (Type == CGType::Kernel) - Streams = std::move(Handler.MStreamStorage); HandlerImpl->MEventMode = SubmitInfo.EventMode(); @@ -340,7 +337,8 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, (Type == CGType::ExecCommandBuffer && HandlerImpl->MExecGraph->containsHostTask()); - auto requiresPostProcess = SubmitInfo.PostProcessorFunc() || Streams.size(); + auto requiresPostProcess = + SubmitInfo.PostProcessorFunc() || Handler.MStreamStorage.size(); auto noLastEventPath = !isHostTask && MNoLastEventMode.load(std::memory_order_acquire) && !requiresPostProcess; @@ -382,21 +380,24 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, handlerPostProcess(Handler, SubmitInfo.PostProcessorFunc(), Event); } - for (auto &Stream : Streams) { - // We don't want stream flushing to be blocking operation that is why submit - // a host task to print stream buffer. It will fire up as soon as the kernel - // finishes execution. - auto L = [&](handler &ServiceCGH) { - Stream->generateFlushCommand(ServiceCGH); - }; - detail::type_erased_cgfo_ty CGF{L}; - detail::EventImplPtr FlushEvent = submit_impl( - CGF, SecondaryQueue, /*CallerNeedsEvent*/ true, Loc, IsTopCodeLoc, {}); - if (EventImpl) - EventImpl->attachEventToCompleteWeak(FlushEvent); - if (!isInOrder()) { - // For in-order queue, the dependencies will be tracked by LastEvent - registerStreamServiceEvent(FlushEvent); + if (Type == CGType::Kernel) { + for (auto &Stream : Handler.MStreamStorage) { + // We don't want stream flushing to be blocking operation that is why + // submit a host task to print stream buffer. It will fire up as soon as + // the kernel finishes execution. + auto L = [&](handler &ServiceCGH) { + Stream->generateFlushCommand(ServiceCGH); + }; + detail::type_erased_cgfo_ty CGF{L}; + detail::EventImplPtr FlushEvent = + submit_impl(CGF, SecondaryQueue, /*CallerNeedsEvent*/ true, Loc, + IsTopCodeLoc, {}); + if (EventImpl) + EventImpl->attachEventToCompleteWeak(FlushEvent); + if (!isInOrder()) { + // For in-order queue, the dependencies will be tracked by LastEvent + registerStreamServiceEvent(FlushEvent); + } } } diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index d6892011b991e..04cd3e5a294fd 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -743,7 +743,9 @@ class queue_impl : public std::enable_shared_from_this { MEmpty.store(false, std::memory_order_release); - synchronizeWithExternalEvent(Handler); + if (MInOrderExternalEvent.read_unlocked()) { + synchronizeWithExternalEvent(Handler); + } auto Event = parseEvent(Handler.finalize()); @@ -783,7 +785,9 @@ class queue_impl : public std::enable_shared_from_this { MEmpty = false; MNoLastEventMode = false; - synchronizeWithExternalEvent(Handler); + if (MInOrderExternalEvent.read_unlocked()) { + synchronizeWithExternalEvent(Handler); + } EventToBuildDeps = parseEvent(Handler.finalize()); assert(EventToBuildDeps); @@ -813,7 +817,9 @@ class queue_impl : public std::enable_shared_from_this { MEmpty = false; - synchronizeWithExternalEvent(Handler); + if (MInOrderExternalEvent.read_unlocked()) { + synchronizeWithExternalEvent(Handler); + } EventToBuildDeps = parseEvent(Handler.finalize()); if (EventToBuildDeps) @@ -1036,12 +1042,18 @@ class queue_impl : public std::enable_shared_from_this { } } } + DataType read() { if (!MIsSet.load(std::memory_order_acquire)) return DataType{}; std::lock_guard Lock(MDataMtx); return MData; } + + // To use when the queue is already acquired a mutex lock. + DataType read_unlocked() { + return MData; + } }; const bool MIsInorder; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 12de67aac105f..add08ff929501 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -3850,8 +3850,7 @@ ur_result_t UpdateCommandBufferCommand::enqueueImp() { auto Device = MQueue->get_device(); auto &Partitions = MGraph->getPartitions(); for (auto &[PartitionIndex, NodeImpl] : PartitionedNodes) { - auto CommandBuffer = Partitions[PartitionIndex]->MCommandBuffers[Device]; - MGraph->updateURImpl(CommandBuffer, NodeImpl); + MGraph->updateURImpl(Partitions[PartitionIndex]->MCommandBuffer, NodeImpl); } return UR_RESULT_SUCCESS; diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 4444e519b3515..e7515d97bd4d6 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -433,12 +433,18 @@ detail::EventImplPtr handler::finalize() { event handler::finalize() { #endif const auto &type = getType(); - detail::queue_impl *Queue = impl->get_queue_or_null(); - ext::oneapi::experimental::detail::graph_impl *Graph = - impl->get_graph_or_null(); + detail::queue_impl *const Queue = impl->get_queue_or_null(); + + const bool IsQueueBeingRecorded = Queue && Queue->hasCommandGraph(); + const bool IsExplicitGraphAPI = impl->get_graph_or_null() != nullptr; + const bool IsGraphEnqueue = + impl->MCGType == detail::CGType::ExecCommandBuffer; + const bool IsGraphRelated = + IsQueueBeingRecorded || IsExplicitGraphAPI || IsGraphEnqueue; + const bool KernelFastPath = - (Queue && !Graph && !impl->MSubgraphNode && !Queue->hasCommandGraph() && - !impl->CGData.MRequirements.size() && !MStreamStorage.size() && + (Queue && !IsGraphRelated && !impl->CGData.MRequirements.size() && + !MStreamStorage.size() && detail::Scheduler::areEventsSafeForSchedulerBypass( impl->CGData.MEvents, Queue->getContextImpl())); @@ -455,6 +461,7 @@ event handler::finalize() { // According to 4.7.6.9 of SYCL2020 spec, if a placeholder accessor is passed // to a command without being bound to a command group, an exception should // be thrown. + if (!IsGraphEnqueue) { for (const auto &arg : impl->MArgs) { if (arg.MType != detail::kernel_param_kind_t::kind_accessor) @@ -557,7 +564,7 @@ event handler::finalize() { // kernel bypassing scheduler and avoiding CommandGroup, Command objects // creation. std::vector RawEvents = detail::Command::getUrEvents( - impl->CGData.MEvents, impl->get_queue_or_null(), false); + impl->CGData.MEvents, Queue, false); bool DiscardEvent = !impl->MEventNeeded && impl->get_queue().supportsDiscardingPiEvents(); @@ -588,7 +595,7 @@ event handler::finalize() { std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData( StreamID, MKernel, MCodeLoc, impl->MIsTopCodeLoc, MKernelName.data(), impl->MKernelNameBasedCachePtr, - impl->get_queue_or_null(), impl->MNDRDesc, KernelBundleImpPtr, + Queue, impl->MNDRDesc, KernelBundleImpPtr, impl->MArgs); detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent, @@ -627,9 +634,8 @@ event handler::finalize() { if (DiscardEvent) { EnqueueKernel(); } else { - detail::queue_impl &Queue = impl->get_queue(); - ResultEvent->setQueue(Queue); - ResultEvent->setWorkerQueue(Queue.weak_from_this()); + ResultEvent->setQueue(*Queue); + ResultEvent->setWorkerQueue(Queue->weak_from_this()); ResultEvent->setContextImpl(impl->get_context()); ResultEvent->setStateIncomplete(); ResultEvent->setSubmissionTime(); @@ -637,7 +643,7 @@ event handler::finalize() { EnqueueKernel(); ResultEvent->setEnqueued(); // connect returned event with dependent events - if (!Queue.isInOrder()) { + if (!Queue->isInOrder()) { // MEvents is not used anymore, so can move. ResultEvent->getPreparedDepsEvents() = std::move(impl->CGData.MEvents); @@ -667,7 +673,7 @@ event handler::finalize() { std::move(impl->MNDRDesc), std::move(MHostKernel), std::move(MKernel), std::move(impl->MKernelBundle), std::move(impl->CGData), std::move(impl->MArgs), toKernelNameStrT(MKernelName), - impl->MKernelNameBasedCachePtr, std::move(MStreamStorage), + impl->MKernelNameBasedCachePtr, MStreamStorage, std::move(impl->MAuxiliaryResources), getType(), impl->MKernelCacheConfig, impl->MKernelIsCooperative, impl->MKernelUsesClusterLaunch, impl->MKernelWorkGroupMemorySize, @@ -725,7 +731,6 @@ event handler::finalize() { case detail::CGType::EnqueueNativeCommand: case detail::CGType::CodeplayHostTask: { detail::context_impl &Context = impl->get_context(); - detail::queue_impl *Queue = impl->get_queue_or_null(); CommandGroup.reset(new detail::CGHostTask( std::move(impl->MHostTask), Queue, &Context, std::move(impl->MArgs), std::move(impl->CGData), getType(), MCodeLoc)); @@ -774,7 +779,6 @@ event handler::finalize() { break; } case detail::CGType::ExecCommandBuffer: { - detail::queue_impl *Queue = impl->get_queue_or_null(); std::shared_ptr ParentGraph = Queue ? Queue->getCommandGraph() : impl->get_graph().shared_from_this(); @@ -791,15 +795,14 @@ event handler::finalize() { // pass the exec_graph_impl and event dependencies. Since this subgraph CG // will not be executed this is fine. CommandGroup.reset(new sycl::detail::CGExecCommandBuffer( - nullptr, impl->MExecGraph, std::move(impl->CGData))); + nullptr, impl->MExecGraph->shared_from_this(), std::move(impl->CGData))); } else { - detail::queue_impl &Queue = impl->get_queue(); bool DiscardEvent = !impl->MEventNeeded && - Queue.supportsDiscardingPiEvents() && + Queue->supportsDiscardingPiEvents() && !impl->MExecGraph->containsHostTask(); - detail::EventImplPtr GraphCompletionEvent = impl->MExecGraph->enqueue( - Queue, std::move(impl->CGData), !DiscardEvent); + detail::EventImplPtr GraphCompletionEvent = + impl->MExecGraph->enqueue(*Queue, impl->CGData, !DiscardEvent); #ifdef __INTEL_PREVIEW_BREAKING_CHANGES return GraphCompletionEvent; #else @@ -852,7 +855,7 @@ event handler::finalize() { // If there is a graph associated with the handler we are in the explicit // graph mode, so we store the CG instead of submitting it to the scheduler, // so it can be retrieved by the graph later. - if (impl->get_graph_or_null()) { + if (IsExplicitGraphAPI) { impl->MGraphNodeCG = std::move(CommandGroup); auto EventImpl = detail::event_impl::create_completed_host_event(); #ifdef __INTEL_PREVIEW_BREAKING_CHANGES @@ -867,7 +870,8 @@ event handler::finalize() { // If the queue has an associated graph then we need to take the CG and pass // it to the graph to create a node, rather than submit it to the scheduler. - if (auto GraphImpl = Queue->getCommandGraph(); GraphImpl) { + if (IsQueueBeingRecorded) { + auto GraphImpl = Queue->getCommandGraph(); auto EventImpl = detail::event_impl::create_completed_host_event(); EventImpl->setSubmittedQueue(Queue->weak_from_this()); std::shared_ptr NodeImpl = @@ -2256,11 +2260,10 @@ void handler::setKernelWorkGroupMem(size_t Size) { } void handler::ext_oneapi_graph( - ext::oneapi::experimental::command_graph< - ext::oneapi::experimental::graph_state::executable> - Graph) { + const ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable> &Graph) { setType(detail::CGType::ExecCommandBuffer); - impl->MExecGraph = detail::getSyclObjImpl(Graph); + impl->MExecGraph = detail::getSyclObjImpl(Graph).get(); } std::shared_ptr diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp index aa3c32beb65fe..b21dfec4b3e4f 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp @@ -12,6 +12,11 @@ #include int main() { + + using namespace std::chrono; + + + queue InOrderQueue{property::queue::in_order{}}; using T = int; @@ -43,13 +48,25 @@ int main() { auto GraphExec = Graph.finalize(); + // Start time + auto start = high_resolution_clock::now(); exp_ext::execute_graph(InOrderQueue, GraphExec); + + auto end = high_resolution_clock::now(); + + // Calculatce duration + auto duration = duration_cast(end - start); + + std::cout << "Time taken by function: " << duration.count() << " ns\n"; + InOrderQueue.wait_and_throw(); free(PtrA, InOrderQueue); free(PtrB, InOrderQueue); free(PtrC, InOrderQueue); + + for (size_t i = 0; i < Size; i++) { T Ref = Pattern * i; assert(check_value(i, Ref, Output[i], "Output")); diff --git a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp index b6380276e5826..371f52da0fd3a 100644 --- a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp +++ b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp @@ -61,8 +61,7 @@ class LimitedHandler { handler_impl(std::shared_ptr Queue) : MQueue(Queue) {} std::shared_ptr MQueue; MockQueueImpl &get_queue() { return *MQueue; } - std::shared_ptr - MExecGraph; + ext::oneapi::experimental::detail::exec_graph_impl* MExecGraph; }; std::shared_ptr impl; std::shared_ptr MKernel; From ae3f22f9e0edd9d978760089b7b9cc97b5a74679 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Thu, 24 Jul 2025 08:11:28 -0700 Subject: [PATCH 2/6] formatting --- sycl/source/detail/graph/graph_impl.cpp | 8 ++++---- sycl/source/detail/handler_impl.hpp | 2 +- sycl/source/detail/queue_impl.hpp | 7 +++---- sycl/source/handler.cpp | 15 +++++++-------- .../RecordReplay/ext_oneapi_enqueue_functions.cpp | 4 ---- .../unittests/scheduler/InOrderQueueSyncCheck.cpp | 2 +- 6 files changed, 16 insertions(+), 22 deletions(-) diff --git a/sycl/source/detail/graph/graph_impl.cpp b/sycl/source/detail/graph/graph_impl.cpp index 116b94ed91ba8..5a3bf07ef690b 100644 --- a/sycl/source/detail/graph/graph_impl.cpp +++ b/sycl/source/detail/graph/graph_impl.cpp @@ -1026,8 +1026,8 @@ EventImplPtr exec_graph_impl::enqueuePartitionDirectly( if (!EventNeeded) { Queue.getAdapter().call( - Queue.getHandleRef(), Partition->MCommandBuffer, - UrEnqueueWaitListSize, UrEnqueueWaitList, nullptr); + Queue.getHandleRef(), Partition->MCommandBuffer, UrEnqueueWaitListSize, + UrEnqueueWaitList, nullptr); return nullptr; } else { auto NewEvent = sycl::detail::event_impl::create_device_event(Queue); @@ -1036,8 +1036,8 @@ EventImplPtr exec_graph_impl::enqueuePartitionDirectly( NewEvent->setSubmissionTime(); ur_event_handle_t UrEvent = nullptr; Queue.getAdapter().call( - Queue.getHandleRef(), Partition->MCommandBuffer, - UrEventHandles.size(), UrEnqueueWaitList, &UrEvent); + Queue.getHandleRef(), Partition->MCommandBuffer, UrEventHandles.size(), + UrEnqueueWaitList, &UrEvent); NewEvent->setHandle(UrEvent); NewEvent->setEventFromSubmittedExecCommandBuffer(true); return NewEvent; diff --git a/sycl/source/detail/handler_impl.hpp b/sycl/source/detail/handler_impl.hpp index 3a5d2fd005c63..7d2a543f7da75 100644 --- a/sycl/source/detail/handler_impl.hpp +++ b/sycl/source/detail/handler_impl.hpp @@ -204,7 +204,7 @@ class handler_impl { /// If we are submitting a graph using ext_oneapi_graph this will be the graph /// to be executed. - ext::oneapi::experimental::detail::exec_graph_impl* MExecGraph; + ext::oneapi::experimental::detail::exec_graph_impl *MExecGraph; /// Storage for the CG created when handling graph nodes added explicitly. std::unique_ptr MGraphNodeCG; /// Storage for node dependencies passed when adding a graph node explicitly diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index e1bae5c7314aa..62a89ece3bcef 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -276,7 +276,8 @@ class queue_impl : public std::enable_shared_from_this { ur_native_handle_t nativeHandle = 0; getAdapter().call(MQueue, nullptr, &nativeHandle); - __SYCL_OCL_CALL(clRetainCommandQueue, ur::cast(nativeHandle)); + __SYCL_OCL_CALL(clRetainCommandQueue, + ur::cast(nativeHandle)); return ur::cast(nativeHandle); } @@ -1052,9 +1053,7 @@ class queue_impl : public std::enable_shared_from_this { } // To use when the queue is already acquired a mutex lock. - DataType read_unlocked() { - return MData; - } + DataType read_unlocked() { return MData; } }; const bool MIsInorder; diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 71eafda25d1ce..98333e8e5e3a2 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -483,8 +483,7 @@ event handler::finalize() { // According to 4.7.6.9 of SYCL2020 spec, if a placeholder accessor is passed // to a command without being bound to a command group, an exception should // be thrown. - if (!IsGraphEnqueue) - { + if (!IsGraphEnqueue) { for (const auto &arg : impl->MArgs) { if (arg.MType != detail::kernel_param_kind_t::kind_accessor) continue; @@ -585,8 +584,8 @@ event handler::finalize() { // the graph is not changed, then this faster path is used to submit // kernel bypassing scheduler and avoiding CommandGroup, Command objects // creation. - std::vector RawEvents = detail::Command::getUrEvents( - impl->CGData.MEvents, Queue, false); + std::vector RawEvents = + detail::Command::getUrEvents(impl->CGData.MEvents, Queue, false); bool DiscardEvent = !impl->MEventNeeded && impl->get_queue().supportsDiscardingPiEvents(); @@ -616,9 +615,8 @@ event handler::finalize() { StreamID = xptiRegisterStream(detail::SYCL_STREAM_NAME); std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData( StreamID, MKernel, MCodeLoc, impl->MIsTopCodeLoc, - MKernelName.data(), impl->MKernelNameBasedCachePtr, - Queue, impl->MNDRDesc, KernelBundleImpPtr, - impl->MArgs); + MKernelName.data(), impl->MKernelNameBasedCachePtr, Queue, + impl->MNDRDesc, KernelBundleImpPtr, impl->MArgs); detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent, xpti::trace_task_begin, nullptr); @@ -816,7 +814,8 @@ event handler::finalize() { // pass the exec_graph_impl and event dependencies. Since this subgraph CG // will not be executed this is fine. CommandGroup.reset(new sycl::detail::CGExecCommandBuffer( - nullptr, impl->MExecGraph->shared_from_this(), std::move(impl->CGData))); + nullptr, impl->MExecGraph->shared_from_this(), + std::move(impl->CGData))); } else { bool DiscardEvent = !impl->MEventNeeded && diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp index b21dfec4b3e4f..28052cd1f96c3 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp @@ -15,8 +15,6 @@ int main() { using namespace std::chrono; - - queue InOrderQueue{property::queue::in_order{}}; using T = int; @@ -65,8 +63,6 @@ int main() { free(PtrB, InOrderQueue); free(PtrC, InOrderQueue); - - for (size_t i = 0; i < Size; i++) { T Ref = Pattern * i; assert(check_value(i, Ref, Output[i], "Output")); diff --git a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp index 371f52da0fd3a..bf8819da80754 100644 --- a/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp +++ b/sycl/unittests/scheduler/InOrderQueueSyncCheck.cpp @@ -61,7 +61,7 @@ class LimitedHandler { handler_impl(std::shared_ptr Queue) : MQueue(Queue) {} std::shared_ptr MQueue; MockQueueImpl &get_queue() { return *MQueue; } - ext::oneapi::experimental::detail::exec_graph_impl* MExecGraph; + ext::oneapi::experimental::detail::exec_graph_impl *MExecGraph; }; std::shared_ptr impl; std::shared_ptr MKernel; From ed906e74f60320ac083213bf2f7980692f1ab01f Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Thu, 24 Jul 2025 08:12:12 -0700 Subject: [PATCH 3/6] merge conflict --- sycl/source/handler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 98333e8e5e3a2..84ba369aa5249 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -662,7 +662,7 @@ event handler::finalize() { EnqueueKernel(); ResultEvent->setEnqueued(); // connect returned event with dependent events - if (!Queue->isInOrder()) { + if (!Queue.isInOrder()) { // MEvents is not used anymore, so can move. ResultEvent->getPreparedDepsEvents() = std::move(impl->CGData.MEvents); From f9e7208c0bbd7b5fbe5e1271375756b567fea507 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Thu, 24 Jul 2025 08:58:13 -0700 Subject: [PATCH 4/6] merge conflict --- sycl/source/detail/graph/graph_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/detail/graph/graph_impl.cpp b/sycl/source/detail/graph/graph_impl.cpp index 5a3bf07ef690b..6c50bf23ba467 100644 --- a/sycl/source/detail/graph/graph_impl.cpp +++ b/sycl/source/detail/graph/graph_impl.cpp @@ -910,7 +910,7 @@ exec_graph_impl::~exec_graph_impl() { for (const auto &Partition : MPartitions) { Partition->MSchedule.clear(); if (Partition->MCommandBuffer) { - ur_result_t Res = Adapter->call_nocheck< + ur_result_t Res = Adapter.call_nocheck< sycl::detail::UrApiKind::urCommandBufferReleaseExp>( Partition->MCommandBuffer); (void)Res; From 847aad91a1904a98cdb5d023414c49dbd6945df5 Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Thu, 24 Jul 2025 19:29:18 -0700 Subject: [PATCH 5/6] guard abi breaking changes --- sycl/include/sycl/handler.hpp | 7 ++++++- sycl/source/handler.cpp | 9 ++++++++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index be485e85066f0..6768157b1de79 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -2824,10 +2824,15 @@ class __SYCL_EXPORT handler { /// Executes a command_graph. /// /// \param Graph Executable command_graph to run +#ifdef __INTEL_PREVIEW_BREAKING_CHANGES void ext_oneapi_graph(const ext::oneapi::experimental::command_graph< ext::oneapi::experimental::graph_state::executable> &Graph); - +#else + void ext_oneapi_graph(ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable> + Graph); +#endif /// Copies data from host to device, where \p Src is a USM pointer and \p Dest /// is an opaque image memory handle. An exception is thrown if either \p Src /// is nullptr or \p Dest is incomplete. The behavior is undefined if diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 84ba369aa5249..71ab91188423a 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -2323,8 +2323,15 @@ void handler::setKernelWorkGroupMem(size_t Size) { } void handler::ext_oneapi_graph( +#ifndef __INTEL_PREVIEW_BREAKING_CHANGES + ext::oneapi::experimental::command_graph< + ext::oneapi::experimental::graph_state::executable> + Graph +#else const ext::oneapi::experimental::command_graph< - ext::oneapi::experimental::graph_state::executable> &Graph) { + ext::oneapi::experimental::graph_state::executable> &Graph +#endif +) { setType(detail::CGType::ExecCommandBuffer); impl->MExecGraph = detail::getSyclObjImpl(Graph).get(); } From 2a72f53087c9d21dc510bd7a9144c474ce11a63a Mon Sep 17 00:00:00 2001 From: Pablo Reble Date: Wed, 30 Jul 2025 09:12:14 -0700 Subject: [PATCH 6/6] Revert changes on e2e-test --- .../RecordReplay/ext_oneapi_enqueue_functions.cpp | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp index 527638f83e42a..03bd54face2a1 100644 --- a/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/ext_oneapi_enqueue_functions.cpp @@ -10,9 +10,6 @@ #include int main() { - - using namespace std::chrono; - queue InOrderQueue{property::queue::in_order{}}; using T = int; @@ -44,17 +41,7 @@ int main() { auto GraphExec = Graph.finalize(); - // Start time - auto start = high_resolution_clock::now(); exp_ext::execute_graph(InOrderQueue, GraphExec); - - auto end = high_resolution_clock::now(); - - // Calculatce duration - auto duration = duration_cast(end - start); - - std::cout << "Time taken by function: " << duration.count() << " ns\n"; - InOrderQueue.wait_and_throw(); free(PtrA, InOrderQueue);