From 1d1bf166284cc72a5285c6938319cf846b29922f Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 29 Sep 2025 09:39:16 +0000 Subject: [PATCH 01/10] [SYCL] Add scheduler-bypass for handler-less kernel submission path The handler-less kernel submission path has been extended to support the fast, scheduler-bypass submission. --- sycl/source/detail/queue_impl.cpp | 112 ++++++++++++++++++++++-------- 1 file changed, 83 insertions(+), 29 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 79769d8819000..fc62f0ab3fd80 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -432,30 +432,75 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( KData.setKernelFunc(HostKernel->getPtr()); KData.setNDRDesc(NDRDesc); - auto SubmitKernelFunc = - [&](detail::CG::StorageInitHelper &CGData) -> EventImplPtr { - std::unique_ptr CommandGroup; - std::vector> StreamStorage; - std::vector> AuxiliaryResources; - - KData.extractArgsAndReqsFromLambda(); - - CommandGroup.reset(new detail::CGExecKernel( - KData.getNDRDesc(), HostKernel, - nullptr, // Kernel - nullptr, // KernelBundle - std::move(CGData), std::move(KData).getArgs(), - *KData.getDeviceKernelInfoPtr(), std::move(StreamStorage), - std::move(AuxiliaryResources), detail::CGType::Kernel, - UR_KERNEL_CACHE_CONFIG_DEFAULT, - false, // KernelIsCooperative - false, // KernelUsesClusterLaunch - 0, // KernelWorkGroupMemorySize - CodeLoc)); - CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; - - return detail::Scheduler::getInstance().addCG(std::move(CommandGroup), - *this, true); + auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData, + bool SchedulerBypass) -> EventImplPtr { + if (SchedulerBypass) { + bool DiscardEvent = !CallerNeedsEvent && supportsDiscardingPiEvents(); + std::vector RawEvents; + + if (CGData.MEvents.size() > 0) { + RawEvents = detail::Command::getUrEvents(CGData.MEvents, this, false); + } + + std::shared_ptr ResultEvent = + DiscardEvent ? nullptr + : detail::event_impl::create_device_event(*this); + + if (!DiscardEvent) { + ResultEvent->setWorkerQueue(weak_from_this()); + ResultEvent->setStateIncomplete(); + ResultEvent->setSubmissionTime(); + } + + enqueueImpKernel(*this, KData.getNDRDesc(), KData.getArgs(), + nullptr, // KernelBundle + nullptr, // Kernel + *KData.getDeviceKernelInfoPtr(), RawEvents, + ResultEvent.get(), + nullptr, // getMemAllocationFunc + UR_KERNEL_CACHE_CONFIG_DEFAULT, + false, // KernelIsCooperative + false, // KernelUsesClusterLaunch + 0, // WorkGroupMemorySize + nullptr, // BinImage + KData.getKernelFuncPtr()); + + if (!DiscardEvent) { + ResultEvent->setEnqueued(); + // connect returned event with dependent events + if (!isInOrder()) { + // MEvents is not used anymore, so can move. + ResultEvent->getPreparedDepsEvents() = std::move(CGData.MEvents); + // ResultEvent is local for current thread, no need to lock. + ResultEvent->cleanDepEventsThroughOneLevelUnlocked(); + } + } + + return ResultEvent; + } else { + std::unique_ptr CommandGroup; + std::vector> StreamStorage; + std::vector> AuxiliaryResources; + + KData.extractArgsAndReqsFromLambda(); + + CommandGroup.reset(new detail::CGExecKernel( + KData.getNDRDesc(), HostKernel, + nullptr, // Kernel + nullptr, // KernelBundle + std::move(CGData), std::move(KData).getArgs(), + *KData.getDeviceKernelInfoPtr(), std::move(StreamStorage), + std::move(AuxiliaryResources), detail::CGType::Kernel, + UR_KERNEL_CACHE_CONFIG_DEFAULT, + false, // KernelIsCooperative + false, // KernelUsesClusterLaunch + 0, // KernelWorkGroupMemorySize + CodeLoc)); + CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; + + return detail::Scheduler::getInstance().addCG(std::move(CommandGroup), + *this, true); + } }; return submit_direct(CallerNeedsEvent, SubmitKernelFunc); @@ -505,15 +550,24 @@ queue_impl::submit_direct(bool CallerNeedsEvent, } } - EventImplPtr EventImpl = SubmitCommandFunc(CGData); + bool SchedulerBypass = + CGData.MEvents.size() > 0 + ? detail::Scheduler::areEventsSafeForSchedulerBypass(CGData.MEvents, + getContextImpl()) + : true; - // Sync with the last event for in order queue - if (isInOrder() && !EventImpl->isDiscarded()) { - LastEvent = EventImpl; + EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass); + + // Sync with the last event for in order queue. For scheduler-bypass flow, + // the ordering is done at the layers below the SYCL runtime, + // but for the scheduler-based flow, it needs to be done here, as the + // scheduler handles host task submissions. + if (isInOrder()) { + LastEvent = SchedulerBypass ? nullptr : EventImpl; } // Barrier and un-enqueued commands synchronization for out or order queue - if (!isInOrder() && !EventImpl->isEnqueued()) { + if (!isInOrder() && EventImpl && !EventImpl->isEnqueued()) { MDefaultGraphDeps.UnenqueuedCmdEvents.push_back(EventImpl); } From 91ad6dd9068179b7c10431cd3e233c5139e54ea4 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 29 Sep 2025 09:52:53 +0000 Subject: [PATCH 02/10] Remove unnecessary EventImpl check --- sycl/source/detail/queue_impl.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index fc62f0ab3fd80..c853163a32d1d 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -567,7 +567,7 @@ queue_impl::submit_direct(bool CallerNeedsEvent, } // Barrier and un-enqueued commands synchronization for out or order queue - if (!isInOrder() && EventImpl && !EventImpl->isEnqueued()) { + if (!isInOrder() && !EventImpl->isEnqueued()) { MDefaultGraphDeps.UnenqueuedCmdEvents.push_back(EventImpl); } From 4d06579cf19293bee58cc60c139b48ce6ee0646f Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 30 Sep 2025 10:38:56 +0000 Subject: [PATCH 03/10] Extract the scheduler bypass logic into a separate function and call it from the handler and handler-less functions --- sycl/source/detail/queue_impl.cpp | 149 +++++++++++++++++++++--------- sycl/source/detail/queue_impl.hpp | 24 ++++- sycl/source/handler.cpp | 99 ++------------------ 3 files changed, 131 insertions(+), 141 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index c853163a32d1d..665e924a8344c 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -420,13 +420,110 @@ queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF, return EventImpl; } -detail::EventImplPtr queue_impl::submit_kernel_direct_impl( +EventImplPtr queue_impl::submit_kernel_scheduler_bypass( + KernelData &KData, std::vector &DepEvents, + bool EventNeeded, std::shared_ptr &Kernel, + detail::kernel_bundle_impl *KernelBundleImpPtr, + const detail::code_location &CodeLoc, bool IsTopCodeLoc) { + std::vector RawEvents; + + // TODO checking the size of the events vector and avoiding the call is + // more efficient here at this point + if (DepEvents.size() > 0) { + RawEvents = detail::Command::getUrEvents(DepEvents, this, false); + } + + bool DiscardEvent = !EventNeeded && supportsDiscardingPiEvents(); + if (DiscardEvent) { + // Kernel only uses assert if it's non interop one + bool KernelUsesAssert = + !(Kernel && Kernel->isInterop()) && KData.usesAssert(); + DiscardEvent = !KernelUsesAssert; + } + + std::shared_ptr ResultEvent = + DiscardEvent ? nullptr : detail::event_impl::create_device_event(*this); + + auto EnqueueKernel = [&]() { +#ifdef XPTI_ENABLE_INSTRUMENTATION + xpti_td *CmdTraceEvent = nullptr; + uint64_t InstanceID = 0; + auto StreamID = detail::getActiveXPTIStreamID(); + // Only enable instrumentation if there are subscribes to the SYCL + // stream + const bool xptiEnabled = xptiCheckTraceEnabled(StreamID); + if (xptiEnabled) { + std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData( + StreamID, Kernel, CodeLoc, IsTopCodeLoc, + *KData.getDeviceKernelInfoPtr(), this, KData.getNDRDesc(), + KernelBundleImpPtr, KData.getArgs()); + detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent, + xpti::trace_task_begin, nullptr); + } +#endif + const detail::RTDeviceBinaryImage *BinImage = nullptr; + if (detail::SYCLConfig::get()) { + BinImage = detail::retrieveKernelBinary(*this, KData.getKernelName()); + assert(BinImage && "Failed to obtain a binary image."); + } + enqueueImpKernel(*this, KData.getNDRDesc(), KData.getArgs(), + KernelBundleImpPtr, Kernel.get(), + *KData.getDeviceKernelInfoPtr(), RawEvents, + ResultEvent.get(), nullptr, KData.getKernelCacheConfig(), + KData.isCooperative(), KData.usesClusterLaunch(), + KData.getKernelWorkGroupMemorySize(), BinImage, + KData.getKernelFuncPtr()); +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (xptiEnabled) { + // Emit signal only when event is created + if (!DiscardEvent) { + detail::emitInstrumentationGeneral( + StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal, + static_cast(ResultEvent->getHandle())); + } + detail::emitInstrumentationGeneral(StreamID, InstanceID, CmdTraceEvent, + xpti::trace_task_end, nullptr); + } +#endif + }; + + if (DiscardEvent) { + EnqueueKernel(); + } else { + ResultEvent->setWorkerQueue(weak_from_this()); + ResultEvent->setStateIncomplete(); + ResultEvent->setSubmissionTime(); + + EnqueueKernel(); + ResultEvent->setEnqueued(); + // connect returned event with dependent events + if (!isInOrder()) { + // MEvents is not used anymore, so can move. + ResultEvent->getPreparedDepsEvents() = std::move(DepEvents); + // ResultEvent is local for current thread, no need to lock. + ResultEvent->cleanDepEventsThroughOneLevelUnlocked(); + } + } + + return ResultEvent; +} + +EventImplPtr queue_impl::submit_kernel_direct_impl( const NDRDescT &NDRDesc, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { KernelData KData; + detail::code_location CLoc; + bool IsTopCLoc = true; + +#ifdef XPTI_ENABLE_INSTRUMENTATION + if (xptiTraceEnabled()) { + CLoc = CodeLoc; + IsTopCLoc = IsTopCodeLoc; + } +#endif KData.setDeviceKernelInfoPtr(DeviceKernelInfo); KData.setKernelFunc(HostKernel->getPtr()); @@ -435,48 +532,10 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData, bool SchedulerBypass) -> EventImplPtr { if (SchedulerBypass) { - bool DiscardEvent = !CallerNeedsEvent && supportsDiscardingPiEvents(); - std::vector RawEvents; - - if (CGData.MEvents.size() > 0) { - RawEvents = detail::Command::getUrEvents(CGData.MEvents, this, false); - } - - std::shared_ptr ResultEvent = - DiscardEvent ? nullptr - : detail::event_impl::create_device_event(*this); - - if (!DiscardEvent) { - ResultEvent->setWorkerQueue(weak_from_this()); - ResultEvent->setStateIncomplete(); - ResultEvent->setSubmissionTime(); - } - - enqueueImpKernel(*this, KData.getNDRDesc(), KData.getArgs(), - nullptr, // KernelBundle - nullptr, // Kernel - *KData.getDeviceKernelInfoPtr(), RawEvents, - ResultEvent.get(), - nullptr, // getMemAllocationFunc - UR_KERNEL_CACHE_CONFIG_DEFAULT, - false, // KernelIsCooperative - false, // KernelUsesClusterLaunch - 0, // WorkGroupMemorySize - nullptr, // BinImage - KData.getKernelFuncPtr()); - - if (!DiscardEvent) { - ResultEvent->setEnqueued(); - // connect returned event with dependent events - if (!isInOrder()) { - // MEvents is not used anymore, so can move. - ResultEvent->getPreparedDepsEvents() = std::move(CGData.MEvents); - // ResultEvent is local for current thread, no need to lock. - ResultEvent->cleanDepEventsThroughOneLevelUnlocked(); - } - } - - return ResultEvent; + std::shared_ptr Kernel; + return submit_kernel_scheduler_bypass(KData, CGData.MEvents, + CallerNeedsEvent, Kernel, nullptr, + CLoc, IsTopCLoc); } else { std::unique_ptr CommandGroup; std::vector> StreamStorage; @@ -495,8 +554,8 @@ detail::EventImplPtr queue_impl::submit_kernel_direct_impl( false, // KernelIsCooperative false, // KernelUsesClusterLaunch 0, // KernelWorkGroupMemorySize - CodeLoc)); - CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; + CLoc)); + CommandGroup->MIsTopCodeLoc = IsTopCLoc; return detail::Scheduler::getInstance().addCG(std::move(CommandGroup), *this, true); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index c3d6748695423..9fad6c1568d19 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -388,6 +388,24 @@ class queue_impl : public std::enable_shared_from_this { submit_impl(CGF, /*CallerNeedsEvent=*/false, Loc, IsTopCodeLoc, SubmitInfo); } + /// Submits a kernel using the scheduler bypass fast path + /// + /// \param KData an object storing data related to the kernel. + /// \param DepEvents list of event dependencies. + /// \param EventNeeded true, if the resulting event is needed. + /// \param Kernel used, if kernel defined as a kernel object. + /// \param KernelBundleImpPtr used, if kernel bundle defined. + /// \param CodeLoc is the code location of the submit call. + /// \param IsTopCodeLoc used to determine if the object is in a local + /// scope or in the top level scope. + /// + /// \return a SYCL event representing submitted command or nullptr. + EventImplPtr submit_kernel_scheduler_bypass( + KernelData &KData, std::vector &DepEvents, + bool EventNeeded, std::shared_ptr &Kernel, + detail::kernel_bundle_impl *KernelBundleImpPtr, + const detail::code_location &CodeLoc, bool IsTopCodeLoc); + /// Performs a blocking wait for the completion of all enqueued tasks in the /// queue. /// @@ -904,15 +922,15 @@ class queue_impl : public std::enable_shared_from_this { /// scope or in the top level scope. /// /// \return a SYCL event representing submitted command group or nullptr. - detail::EventImplPtr submit_kernel_direct_impl( + EventImplPtr submit_kernel_direct_impl( const NDRDescT &NDRDesc, std::shared_ptr &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); template - detail::EventImplPtr submit_direct(bool CallerNeedsEvent, - SubmitCommandFuncType &SubmitCommandFunc); + EventImplPtr submit_direct(bool CallerNeedsEvent, + SubmitCommandFuncType &SubmitCommandFunc); /// Helper function for submitting a memory operation with a handler. /// \param DepEvents is a vector of dependencies of the operation. diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index ffa7d80eda4d0..a5132e76f0e00 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -638,103 +638,16 @@ event handler::finalize() { // the graph is not changed, then this faster path is used to submit // kernel bypassing scheduler and avoiding CommandGroup, Command objects // creation. - std::vector RawEvents; - // TODO checking the size of the events vector and avoiding the call is - // more efficient here at this point - if (impl->CGData.MEvents.size() > 0) { - RawEvents = detail::Command::getUrEvents( - impl->CGData.MEvents, impl->get_queue_or_null(), false); - } - - bool DiscardEvent = - !impl->MEventNeeded && impl->get_queue().supportsDiscardingPiEvents(); - if (DiscardEvent) { - // Kernel only uses assert if it's non interop one - bool KernelUsesAssert = !(MKernel && MKernel->isInterop()) && - impl->MKernelData.usesAssert(); - DiscardEvent = !KernelUsesAssert; - } - - std::shared_ptr ResultEvent = - DiscardEvent - ? nullptr - : detail::event_impl::create_device_event(impl->get_queue()); - auto EnqueueKernel = [&]() { -#ifdef XPTI_ENABLE_INSTRUMENTATION - xpti_td *CmdTraceEvent = nullptr; - uint64_t InstanceID = 0; - auto StreamID = detail::getActiveXPTIStreamID(); - // Only enable instrumentation if there are subscribes to the SYCL - // stream - const bool xptiEnabled = xptiCheckTraceEnabled(StreamID); - if (xptiEnabled) { - std::tie(CmdTraceEvent, InstanceID) = emitKernelInstrumentationData( - StreamID, MKernel, MCodeLoc, impl->MIsTopCodeLoc, - *impl->MKernelData.getDeviceKernelInfoPtr(), - impl->get_queue_or_null(), impl->MKernelData.getNDRDesc(), - KernelBundleImpPtr, impl->MKernelData.getArgs()); - detail::emitInstrumentationGeneral(StreamID, InstanceID, - CmdTraceEvent, - xpti::trace_task_begin, nullptr); - } -#endif - const detail::RTDeviceBinaryImage *BinImage = nullptr; - if (detail::SYCLConfig::get()) { - BinImage = detail::retrieveKernelBinary(impl->get_queue(), - impl->getKernelName()); - assert(BinImage && "Failed to obtain a binary image."); - } - enqueueImpKernel(impl->get_queue(), impl->MKernelData.getNDRDesc(), - impl->MKernelData.getArgs(), KernelBundleImpPtr, - MKernel.get(), - *impl->MKernelData.getDeviceKernelInfoPtr(), RawEvents, - ResultEvent.get(), nullptr, - impl->MKernelData.getKernelCacheConfig(), - impl->MKernelData.isCooperative(), - impl->MKernelData.usesClusterLaunch(), - impl->MKernelData.getKernelWorkGroupMemorySize(), - BinImage, impl->MKernelData.getKernelFuncPtr()); -#ifdef XPTI_ENABLE_INSTRUMENTATION - if (xptiEnabled) { - // Emit signal only when event is created - if (!DiscardEvent) { - detail::emitInstrumentationGeneral( - StreamID, InstanceID, CmdTraceEvent, xpti::trace_signal, - static_cast(ResultEvent->getHandle())); - } - detail::emitInstrumentationGeneral(StreamID, InstanceID, - CmdTraceEvent, - xpti::trace_task_end, nullptr); - } -#endif - }; - - if (DiscardEvent) { - EnqueueKernel(); - } else { - detail::queue_impl &Queue = impl->get_queue(); - ResultEvent->setWorkerQueue(Queue.weak_from_this()); - ResultEvent->setStateIncomplete(); - ResultEvent->setSubmissionTime(); - - EnqueueKernel(); - ResultEvent->setEnqueued(); - // connect returned event with dependent events - if (!Queue.isInOrder()) { - // MEvents is not used anymore, so can move. - ResultEvent->getPreparedDepsEvents() = - std::move(impl->CGData.MEvents); - // ResultEvent is local for current thread, no need to lock. - ResultEvent->cleanDepEventsThroughOneLevelUnlocked(); - } - } + detail::EventImplPtr EventImpl = + impl->get_queue().submit_kernel_scheduler_bypass( + impl->MKernelData, impl->CGData.MEvents, impl->MEventNeeded, + MKernel, KernelBundleImpPtr, MCodeLoc, impl->MIsTopCodeLoc); #ifdef __INTEL_PREVIEW_BREAKING_CHANGES - return ResultEvent; + return EventImpl; #else return detail::createSyclObjFromImpl( - ResultEvent ? ResultEvent - : detail::event_impl::create_discarded_event()); + EventImpl ? EventImpl : detail::event_impl::create_discarded_event()); #endif } } From 03deefe9cae433e5bcd008d08dfe4244f2594dfa Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Tue, 30 Sep 2025 14:42:29 +0000 Subject: [PATCH 04/10] Change the EventImpl var name back to original --- sycl/source/handler.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index a5132e76f0e00..40806c4155f78 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -639,15 +639,16 @@ event handler::finalize() { // kernel bypassing scheduler and avoiding CommandGroup, Command objects // creation. - detail::EventImplPtr EventImpl = + detail::EventImplPtr ResultEvent = impl->get_queue().submit_kernel_scheduler_bypass( impl->MKernelData, impl->CGData.MEvents, impl->MEventNeeded, MKernel, KernelBundleImpPtr, MCodeLoc, impl->MIsTopCodeLoc); #ifdef __INTEL_PREVIEW_BREAKING_CHANGES - return EventImpl; + return ResultEvent; #else return detail::createSyclObjFromImpl( - EventImpl ? EventImpl : detail::event_impl::create_discarded_event()); + ResultEvent ? ResultEvent + : detail::event_impl::create_discarded_event()); #endif } } From 956d27a31c9b7ec0b26ee639edd633840919423a Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Wed, 1 Oct 2025 08:01:28 +0000 Subject: [PATCH 05/10] Address review comments --- sycl/source/detail/queue_impl.cpp | 54 ++++++++++++------------------- 1 file changed, 21 insertions(+), 33 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 665e924a8344c..95f76541250f6 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -515,15 +515,6 @@ EventImplPtr queue_impl::submit_kernel_direct_impl( const detail::code_location &CodeLoc, bool IsTopCodeLoc) { KernelData KData; - detail::code_location CLoc; - bool IsTopCLoc = true; - -#ifdef XPTI_ENABLE_INSTRUMENTATION - if (xptiTraceEnabled()) { - CLoc = CodeLoc; - IsTopCLoc = IsTopCodeLoc; - } -#endif KData.setDeviceKernelInfoPtr(DeviceKernelInfo); KData.setKernelFunc(HostKernel->getPtr()); @@ -535,31 +526,28 @@ EventImplPtr queue_impl::submit_kernel_direct_impl( std::shared_ptr Kernel; return submit_kernel_scheduler_bypass(KData, CGData.MEvents, CallerNeedsEvent, Kernel, nullptr, - CLoc, IsTopCLoc); - } else { - std::unique_ptr CommandGroup; - std::vector> StreamStorage; - std::vector> AuxiliaryResources; - - KData.extractArgsAndReqsFromLambda(); - - CommandGroup.reset(new detail::CGExecKernel( - KData.getNDRDesc(), HostKernel, - nullptr, // Kernel - nullptr, // KernelBundle - std::move(CGData), std::move(KData).getArgs(), - *KData.getDeviceKernelInfoPtr(), std::move(StreamStorage), - std::move(AuxiliaryResources), detail::CGType::Kernel, - UR_KERNEL_CACHE_CONFIG_DEFAULT, - false, // KernelIsCooperative - false, // KernelUsesClusterLaunch - 0, // KernelWorkGroupMemorySize - CLoc)); - CommandGroup->MIsTopCodeLoc = IsTopCLoc; - - return detail::Scheduler::getInstance().addCG(std::move(CommandGroup), - *this, true); + CodeLoc, IsTopCodeLoc); } + std::unique_ptr CommandGroup; + std::vector> StreamStorage; + std::vector> AuxiliaryResources; + + KData.extractArgsAndReqsFromLambda(); + + CommandGroup.reset(new detail::CGExecKernel( + KData.getNDRDesc(), HostKernel, + nullptr, // Kernel + nullptr, // KernelBundle + std::move(CGData), std::move(KData).getArgs(), + *KData.getDeviceKernelInfoPtr(), std::move(StreamStorage), + std::move(AuxiliaryResources), detail::CGType::Kernel, + KData.getKernelCacheConfig(), KData.isCooperative(), + KData.usesClusterLaunch(), KData.getKernelWorkGroupMemorySize(), + CodeLoc)); + CommandGroup->MIsTopCodeLoc = IsTopCodeLoc; + + return detail::Scheduler::getInstance().addCG(std::move(CommandGroup), + *this, true); }; return submit_direct(CallerNeedsEvent, SubmitKernelFunc); From f15f842aded7d14f2a153bdb2244e201c31b3725 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 3 Oct 2025 09:17:50 +0000 Subject: [PATCH 06/10] Allocate HostKernel on the scheduler path only --- sycl/source/detail/queue_impl.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index c088c0c0f6ce2..6c52e076d659d 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -516,11 +516,8 @@ EventImplPtr queue_impl::submit_kernel_direct_impl( KernelData KData; - std::shared_ptr HostKernelPtr = - HostKernel.takeOrCopyOwnership(); - KData.setDeviceKernelInfoPtr(DeviceKernelInfo); - KData.setKernelFunc(HostKernelPtr->getPtr()); + KData.setKernelFunc(HostKernel.getPtr()); KData.setNDRDesc(NDRDesc); auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData, @@ -534,6 +531,9 @@ EventImplPtr queue_impl::submit_kernel_direct_impl( std::vector> StreamStorage; std::vector> AuxiliaryResources; + std::shared_ptr HostKernelPtr = + HostKernel.takeOrCopyOwnership(); + KData.extractArgsAndReqsFromLambda(); CommandGroup.reset(new detail::CGExecKernel( From e8dc2296b7edfbed42aecbfa05cefcb7a58ee513 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 3 Oct 2025 09:20:48 +0000 Subject: [PATCH 07/10] Fix formatting --- sycl/source/detail/queue_impl.cpp | 3 +-- sycl/source/detail/queue_impl.hpp | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 6c52e076d659d..6c6cef89cc691 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -509,8 +509,7 @@ EventImplPtr queue_impl::submit_kernel_scheduler_bypass( } EventImplPtr queue_impl::submit_kernel_direct_impl( - const NDRDescT &NDRDesc, - detail::HostKernelRefBase &HostKernel, + const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc) { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 764d8545c2429..83d2c7668ad5c 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -921,8 +921,7 @@ class queue_impl : public std::enable_shared_from_this { /// /// \return a SYCL event representing submitted command group or nullptr. EventImplPtr submit_kernel_direct_impl( - const NDRDescT &NDRDesc, - detail::HostKernelRefBase &HostKernel, + const NDRDescT &NDRDesc, detail::HostKernelRefBase &HostKernel, detail::DeviceKernelInfo *DeviceKernelInfo, bool CallerNeedsEvent, const detail::code_location &CodeLoc, bool IsTopCodeLoc); From bcf270f0149425a980da49cb071f855e3b08549f Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 3 Oct 2025 09:36:51 +0000 Subject: [PATCH 08/10] Address review comments --- sycl/source/detail/queue_impl.cpp | 2 +- sycl/source/detail/queue_impl.hpp | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 6c6cef89cc691..d4cb21d7db5aa 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -498,7 +498,7 @@ EventImplPtr queue_impl::submit_kernel_scheduler_bypass( ResultEvent->setEnqueued(); // connect returned event with dependent events if (!isInOrder()) { - // MEvents is not used anymore, so can move. + // DepEvents is not used anymore, so can move. ResultEvent->getPreparedDepsEvents() = std::move(DepEvents); // ResultEvent is local for current thread, no need to lock. ResultEvent->cleanDepEventsThroughOneLevelUnlocked(); diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 83d2c7668ad5c..212c612b5a1d3 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -388,13 +388,13 @@ class queue_impl : public std::enable_shared_from_this { /// Submits a kernel using the scheduler bypass fast path /// - /// \param KData an object storing data related to the kernel. - /// \param DepEvents list of event dependencies. - /// \param EventNeeded true, if the resulting event is needed. - /// \param Kernel used, if kernel defined as a kernel object. - /// \param KernelBundleImpPtr used, if kernel bundle defined. + /// \param KData is an object storing data related to the kernel. + /// \param DepEvents is a list of event dependencies. + /// \param EventNeeded should be true, if the resulting event is needed. + /// \param Kernel to be used, if kernel defined as a kernel object. + /// \param KernelBundleImpPtr to be used, if kernel bundle defined. /// \param CodeLoc is the code location of the submit call. - /// \param IsTopCodeLoc used to determine if the object is in a local + /// \param IsTopCodeLoc is used to determine if the object is in a local /// scope or in the top level scope. /// /// \return a SYCL event representing submitted command or nullptr. From 43b4b3a6f6b9a3af4615c06a7f60819626829130 Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Fri, 3 Oct 2025 13:14:20 +0000 Subject: [PATCH 09/10] Change the LaunchGroupedShortcutMoveKernelNoEvent unit test, to reflect the new logic behind HostKernel construction. --- .../FreeFunctionCommandsEvents.cpp | 36 +++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp index 360bdca27e73f..ea523283064ea 100644 --- a/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp +++ b/sycl/unittests/Extensions/FreeFunctionCommands/FreeFunctionCommandsEvents.cpp @@ -235,14 +235,46 @@ TEST_F(FreeFunctionCommandsEventsTests, TestMoveFunctor::MoveCtorCalls = 0; TestMoveFunctor MoveOnly; + std::mutex CvMutex; + std::condition_variable Cv; + bool ready = false; + + // This kernel submission uses scheduler-bypass path, so the HostKernel + // shouldn't be constructed. + + sycl::khr::launch_grouped(Queue, sycl::range<1>{32}, sycl::range<1>{32}, + std::move(MoveOnly)); + + ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 0); + ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + + // Another kernel submission is queued behind a host task, + // to force the scheduler-based submission. In this case, the HostKernel + // should be constructed. + + Queue.submit([&](sycl::handler &CGH) { + CGH.host_task([&] { + std::unique_lock lk(CvMutex); + Cv.wait(lk, [&ready] { return ready; }); + }); + }); + sycl::khr::launch_grouped(Queue, sycl::range<1>{32}, sycl::range<1>{32}, std::move(MoveOnly)); + + { + std::unique_lock lk(CvMutex); + ready = true; + } + Cv.notify_one(); + + Queue.wait(); + // Move ctor for TestMoveFunctor is called during move construction of // HostKernel. Copy ctor is called by InstantiateKernelOnHost, can't delete // it. ASSERT_EQ(TestMoveFunctor::MoveCtorCalls, 1); - - ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{1}); + ASSERT_EQ(counter_urEnqueueKernelLaunch, size_t{2}); } #endif From 36fa3114481c24a8949c95e3a16c1bbcfc600b5a Mon Sep 17 00:00:00 2001 From: "Ptak, Slawomir" Date: Mon, 6 Oct 2025 17:36:00 +0000 Subject: [PATCH 10/10] Use scheduler bypass path only if no graph associated with the queue --- sycl/source/detail/queue_impl.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 4134913cce7dd..4d6893067afe9 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -659,10 +659,11 @@ queue_impl::submit_direct(bool CallerNeedsEvent, } bool SchedulerBypass = - CGData.MEvents.size() > 0 - ? detail::Scheduler::areEventsSafeForSchedulerBypass(CGData.MEvents, - getContextImpl()) - : true; + (CGData.MEvents.size() > 0 + ? detail::Scheduler::areEventsSafeForSchedulerBypass( + CGData.MEvents, getContextImpl()) + : true) && + !hasCommandGraph(); EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass);