@@ -326,6 +326,16 @@ void queue_impl::addEvent(const detail::EventImplPtr &EventImpl) {
   }
 }
 
+void queue_impl::addEventUnlocked(const detail::EventImplPtr &EventImpl) {
+  if (!EventImpl)
+    return;
+  Command *Cmd = EventImpl->getCommand();
+  if (Cmd != nullptr && EventImpl->getHandle() == nullptr) {
+    std::weak_ptr<event_impl> EventWeakPtr{EventImpl};
+    MEventsWeak.push_back(std::move(EventWeakPtr));
+  }
+}
+
 detail::EventImplPtr
 queue_impl::submit_impl(const detail::type_erased_cgfo_ty &CGF,
                         bool CallerNeedsEvent, const detail::code_location &Loc,
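
The `addEventUnlocked` variant exists because `submit_direct` records the event while already holding `MMutex`; calling the locking `addEvent` there would self-deadlock on a non-recursive mutex. Below is a minimal sketch of this locked/unlocked helper split, using stand-in types rather than the real SYCL ones:

```cpp
#include <mutex>
#include <vector>

class Queue {
  std::mutex MMutex;
  std::vector<int> MEventsWeak; // stands in for the weak event handles

  void addEventUnlocked(int Event) { // precondition: caller holds MMutex
    MEventsWeak.push_back(Event);
  }

public:
  void addEvent(int Event) { // public entry point: takes the lock itself
    std::lock_guard<std::mutex> Lock(MMutex);
    addEventUnlocked(Event);
  }

  void submitLikePath(int Event) {
    std::unique_lock<std::mutex> Lock(MMutex); // held across the whole submit
    // ... dependency bookkeeping under the same lock ...
    addEventUnlocked(Event); // calling addEvent() here would self-deadlock
  }
};

int main() {
  Queue Q;
  Q.addEvent(1);
  Q.submitLikePath(2);
}
```
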
@@ -574,16 +584,23 @@ EventImplPtr queue_impl::submit_kernel_direct_impl(
   KData.validateAndSetKernelLaunchProperties(Props, hasCommandGraph(),
                                              getDeviceImpl());
 
-  auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData,
-                              bool SchedulerBypass) -> EventImplPtr {
+  auto SubmitKernelFunc = [&](detail::CG::StorageInitHelper &CGData)
+      -> std::pair<EventImplPtr, bool> {
+    bool SchedulerBypass =
+        (CGData.MEvents.size() > 0
+             ? detail::Scheduler::areEventsSafeForSchedulerBypass(
+                   CGData.MEvents, getContextImpl())
+             : true) &&
+        !hasCommandGraph();
     if (SchedulerBypass) {
       // No need to copy/move the kernel function, so we set
       // the function pointer to the original function.
       KData.setKernelFunc(HostKernel.getPtr());
 
-      return submit_kernel_scheduler_bypass(KData, CGData.MEvents,
-                                            CallerNeedsEvent, nullptr, nullptr,
-                                            CodeLoc, IsTopCodeLoc);
+      return {submit_kernel_scheduler_bypass(KData, CGData.MEvents,
+                                             CallerNeedsEvent, nullptr, nullptr,
+                                             CodeLoc, IsTopCodeLoc),
+              SchedulerBypass};
     }
     std::unique_ptr<detail::CG> CommandGroup;
     std::vector<std::shared_ptr<detail::stream_impl>> StreamStorage;
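
The scheduler-bypass decision moves from `submit_direct` into the submit callback, which now reports it back together with the event as a `std::pair<EventImplPtr, bool>`. A toy reduction of that pattern, with illustrative names only:

```cpp
#include <iostream>
#include <string>
#include <utility>

int main() {
  // The callback decides the bypass itself and reports it back to the caller.
  auto Submit = [](bool HasUnsafeDeps) -> std::pair<std::string, bool> {
    bool Bypass = !HasUnsafeDeps; // decision now lives inside the callback
    if (Bypass)
      return {"fast-path-event", Bypass};
    return {"scheduler-event", Bypass};
  };

  // The caller unpacks both results with a structured binding, mirroring
  // `auto [EventImpl, SchedulerBypass] = SubmitCommandFunc(CGData);`.
  auto [Event, Bypass] = Submit(/*HasUnsafeDeps=*/false);
  std::cout << Event << ", bypass=" << std::boolalpha << Bypass << "\n";
}
```

Returning the flag from the callback keeps the decision next to the code that acts on it, and lets other callbacks (such as the graph submit below) supply a different policy through the same `submit_direct` template.
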
@@ -611,24 +628,63 @@ EventImplPtr queue_impl::submit_kernel_direct_impl(
     CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
 
     if (auto GraphImpl = getCommandGraph(); GraphImpl) {
-      return submit_command_to_graph(*GraphImpl, std::move(CommandGroup),
-                                     detail::CGType::Kernel);
+      return {submit_command_to_graph(*GraphImpl, std::move(CommandGroup),
+                                      detail::CGType::Kernel),
+              SchedulerBypass};
     }
 
-    return detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
-                                                  *this, true);
+    return {detail::Scheduler::getInstance().addCG(std::move(CommandGroup),
+                                                   *this, true),
+            SchedulerBypass};
   };
 
-  return submit_direct(CallerNeedsEvent, DepEvents, SubmitKernelFunc);
+  return submit_direct(CallerNeedsEvent, DepEvents, SubmitKernelFunc,
+                       detail::CGType::Kernel,
+                       /*CommandFuncContainsHostTask*/ false);
+}
+
+EventImplPtr queue_impl::submit_graph_direct_impl(
+    std::shared_ptr<ext::oneapi::experimental::detail::exec_graph_impl>
+        ExecGraph,
+    bool CallerNeedsEvent, sycl::span<const event> DepEvents,
+    const detail::code_location &CodeLoc, bool IsTopCodeLoc) {
+  bool EventNeeded = CallerNeedsEvent || ExecGraph->containsHostTask() ||
+                     !supportsDiscardingPiEvents();
+  auto SubmitGraphFunc = [&](detail::CG::StorageInitHelper CGData)
+      -> std::pair<EventImplPtr, bool> {
+    if (auto ParentGraph = getCommandGraph(); ParentGraph) {
+      std::unique_ptr<detail::CG> CommandGroup;
+      {
+        ext::oneapi::experimental::detail::graph_impl::ReadLock ParentLock(
+            ParentGraph->MMutex);
+        CGData.MRequirements = ExecGraph->getRequirements();
+        // We use a CommandGroup without a CommandBuffer here to carry the
+        // exec_graph_impl and the event dependencies. Since this subgraph CG
+        // will never be executed, this is fine.
+        CommandGroup.reset(
+            new sycl::detail::CGExecCommandBuffer(nullptr, ExecGraph, CGData));
+      }
+      CommandGroup->MIsTopCodeLoc = IsTopCodeLoc;
+      return {submit_command_to_graph(*ParentGraph, std::move(CommandGroup),
+                                      detail::CGType::ExecCommandBuffer),
+              /*BypassScheduler*/ false};
+    } else {
+      return ExecGraph->enqueue(*this, CGData, EventNeeded);
+    }
+  };
+  return submit_direct(CallerNeedsEvent, DepEvents, SubmitGraphFunc,
+                       detail::CGType::ExecCommandBuffer,
+                       ExecGraph->containsHostTask());
 }
 
 template <typename SubmitCommandFuncType>
-detail::EventImplPtr
-queue_impl::submit_direct(bool CallerNeedsEvent,
-                          sycl::span<const event> DepEvents,
-                          SubmitCommandFuncType &SubmitCommandFunc) {
+detail::EventImplPtr queue_impl::submit_direct(
+    bool CallerNeedsEvent, sycl::span<const event> DepEvents,
+    SubmitCommandFuncType &SubmitCommandFunc, detail::CGType Type,
+    bool CommandFuncContainsHostTask) {
   detail::CG::StorageInitHelper CGData;
   std::unique_lock<std::mutex> Lock(MMutex);
+  const bool inOrder = isInOrder();
 
   // Used by queue_empty() and getLastEvent()
   MEmpty.store(false, std::memory_order_release);
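
With the `Type` and `CommandFuncContainsHostTask` parameters added, one `submit_direct` template now serves both kernel and graph submissions, and dependency registration tags events with the real command-group type instead of assuming `detail::CGType::Kernel`. A compilable toy model of that dispatch, with all types mocked:

```cpp
#include <iostream>
#include <utility>

enum class CGType { Kernel, ExecCommandBuffer };

// Both submission kinds funnel through one template; the command type and a
// host-task flag are threaded in rather than being hardcoded to Kernel.
template <typename SubmitFunc>
int submit_direct(bool CallerNeedsEvent, SubmitFunc &&Func, CGType Type,
                  bool ContainsHostTask) {
  std::cout << "registering deps for CG type " << static_cast<int>(Type)
            << (ContainsHostTask ? " (contains host task)" : "") << "\n";
  auto [Event, Bypass] = Func();
  (void)Bypass;
  return CallerNeedsEvent ? Event : 0;
}

int main() {
  auto KernelFunc = [] { return std::pair<int, bool>{42, true}; };
  auto GraphFunc = [] { return std::pair<int, bool>{7, false}; };
  submit_direct(true, KernelFunc, CGType::Kernel, false);
  submit_direct(true, GraphFunc, CGType::ExecCommandBuffer, true);
}
```
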
@@ -639,29 +695,35 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
     registerEventDependency</*LockQueue*/ false>(
         getSyclObjImpl(*ExternalEvent), CGData.MEvents, this, getContextImpl(),
         getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr,
-        detail::CGType::Kernel);
+        Type);
   }
 
   auto &Deps = hasCommandGraph() ? MExtGraphDeps : MDefaultGraphDeps;
 
   // Sync with the last event for in-order queue
   EventImplPtr &LastEvent = Deps.LastEventPtr;
-  if (isInOrder() && LastEvent) {
+  if (inOrder && LastEvent) {
     registerEventDependency</*LockQueue*/ false>(
         LastEvent, CGData.MEvents, this, getContextImpl(), getDeviceImpl(),
-        hasCommandGraph() ? getCommandGraph().get() : nullptr,
-        detail::CGType::Kernel);
+        hasCommandGraph() ? getCommandGraph().get() : nullptr, Type);
+  } else if (inOrder && MNoLastEventMode && CommandFuncContainsHostTask) {
+    // If we have a host task in an in-order queue in no-last-event mode, we
+    // must add a barrier to ensure ordering.
+    auto ResEvent = insertHelperBarrier();
+    registerEventDependency</*LockQueue*/ false>(
+        ResEvent, CGData.MEvents, this, getContextImpl(), getDeviceImpl(),
+        hasCommandGraph() ? getCommandGraph().get() : nullptr, Type);
   }
 
   for (event e : DepEvents) {
     registerEventDependency</*LockQueue*/ false>(
         getSyclObjImpl(e), CGData.MEvents, this, getContextImpl(),
         getDeviceImpl(), hasCommandGraph() ? getCommandGraph().get() : nullptr,
-        detail::CGType::Kernel);
+        Type);
   }
 
   // Barrier and un-enqueued commands synchronization for out-of-order queue
-  if (!isInOrder()) {
+  if (!inOrder) {
     MMissedCleanupRequests.unset(
         [&](MissedCleanupRequestsType &MissedCleanupRequests) {
           for (auto &UpdatedGraph : MissedCleanupRequests)
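
The new `else if` branch above handles a corner case: in no-last-event mode an in-order queue deliberately keeps no event to chain on, but a host task is ordered by the scheduler and needs a concrete dependency covering all prior device work. The helper barrier recreates that ordering point. A mocked sketch of the decision logic only, not the real types:

```cpp
struct Event {};

struct Queue {
  bool InOrder = true;
  bool NoLastEventMode = true; // fast path keeps no per-submission event
  Event *LastEvent = nullptr;

  Event *insertHelperBarrier() {
    // A wait-with-barrier over an empty wait list covers everything already
    // enqueued and yields a fresh event to chain the host task on.
    static Event Barrier;
    return &Barrier;
  }

  Event *dependencyForHostTask() {
    if (InOrder && LastEvent)
      return LastEvent;             // normal in-order chaining
    if (InOrder && NoLastEventMode)
      return insertHelperBarrier(); // recover an ordering point
    return nullptr;                 // out-of-order: scheduler tracks deps
  }
};

int main() {
  Queue Q;
  return Q.dependencyForHostTask() == nullptr; // expect a barrier event
}
```
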
@@ -674,31 +736,27 @@ queue_impl::submit_direct(bool CallerNeedsEvent,
     }
   }
 
-  bool SchedulerBypass =
-      (CGData.MEvents.size() > 0
-           ? detail::Scheduler::areEventsSafeForSchedulerBypass(
-                 CGData.MEvents, getContextImpl())
-           : true) &&
-      !hasCommandGraph();
+  auto [EventImpl, SchedulerBypass] = SubmitCommandFunc(CGData);
 
   // Synchronize with the "no last event mode", used by the handler-based
   // kernel submit path
-  MNoLastEventMode.store(isInOrder() && SchedulerBypass,
-                         std::memory_order_relaxed);
-
-  EventImplPtr EventImpl = SubmitCommandFunc(CGData, SchedulerBypass);
+  MNoLastEventMode.store(inOrder && SchedulerBypass, std::memory_order_relaxed);
 
   // Sync with the last event for in-order queue. For the scheduler-bypass
   // flow, the ordering is done at the layers below the SYCL runtime, but for
   // the scheduler-based flow, it needs to be done here, as the scheduler
   // handles host task submissions.
-  if (isInOrder()) {
+  if (inOrder) {
     LastEvent = SchedulerBypass ? nullptr : EventImpl;
   }
 
-  // Barrier and un-enqueued commands synchronization for out-of-order queue
-  if (!isInOrder() && !EventImpl->isEnqueued()) {
-    Deps.UnenqueuedCmdEvents.push_back(EventImpl);
+  // Barrier and un-enqueued commands synchronization for out-of-order queue.
+  // The event must also be stored for future wait calls.
+  if (!inOrder) {
+    if (!EventImpl->isEnqueued()) {
+      Deps.UnenqueuedCmdEvents.push_back(EventImpl);
+    }
+    addEventUnlocked(EventImpl);
   }
 
   return CallerNeedsEvent ? EventImpl : nullptr;
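
For out-of-order queues, every submission's event is now recorded via `addEventUnlocked` so that a later `queue::wait()` can find it; `MEventsWeak` holds weak references, so the tracking does not extend event lifetimes. A reduced, compilable model of that bookkeeping (simplified types, not the real implementation):

```cpp
#include <memory>
#include <vector>

struct EventImpl {
  void wait() {}
};

struct Queue {
  std::vector<std::weak_ptr<EventImpl>> EventsWeak;

  void addEventUnlocked(const std::shared_ptr<EventImpl> &E) {
    EventsWeak.emplace_back(E); // weak: does not extend event lifetime
  }

  void wait() {
    for (auto &W : EventsWeak)
      if (auto E = W.lock()) // skip events that were already destroyed
        E->wait();
    EventsWeak.clear();
  }
};

int main() {
  Queue Q;
  auto E = std::make_shared<EventImpl>();
  Q.addEventUnlocked(E);
  Q.wait();
}
```
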
@@ -1149,6 +1207,15 @@ void queue_impl::verifyProps(const property_list &Props) const {
                                                 CheckPropertiesWithData);
 }
 
+EventImplPtr queue_impl::insertHelperBarrier() {
+  auto ResEvent = detail::event_impl::create_device_event(*this);
+  ur_event_handle_t UREvent = nullptr;
+  getAdapter().call<UrApiKind::urEnqueueEventsWaitWithBarrier>(
+      getHandleRef(), 0, nullptr, &UREvent);
+  ResEvent->setHandle(UREvent);
+  return ResEvent;
+}
+
 } // namespace detail
 } // namespace _V1
 } // namespace sycl
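
`insertHelperBarrier` relies on the barrier semantics of `urEnqueueEventsWaitWithBarrier`: passed an empty wait list, it orders against everything previously enqueued on the queue and returns an event that signals completion of that work. A mocked stand-in with the same call shape, to make that contract concrete (the real UR entry point is not reimplemented here):

```cpp
#include <cstdint>

using ur_queue_handle_t = void *;
using ur_event_handle_t = void *;

// Stand-in with the same parameter shape as urEnqueueEventsWaitWithBarrier.
int mockEnqueueEventsWaitWithBarrier(ur_queue_handle_t /*Queue*/,
                                     uint32_t NumEventsInWaitList,
                                     const ur_event_handle_t * /*WaitList*/,
                                     ur_event_handle_t *OutEvent) {
  // NumEventsInWaitList == 0: the barrier covers all prior commands and the
  // out-event signals once they have all completed.
  static int Token;
  if (OutEvent)
    *OutEvent = NumEventsInWaitList == 0 ? &Token : nullptr;
  return 0;
}

int main() {
  ur_event_handle_t Ev = nullptr;
  mockEnqueueEventsWaitWithBarrier(nullptr, 0, nullptr, &Ev);
  return Ev == nullptr; // nonzero exit would mean no event was produced
}
```
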