@@ -435,9 +435,300 @@ ur_result_t urEnqueueEventsWaitWithBarrierExt(
435435 *OutEvent // /< [in,out][optional] return an event object that identifies
436436 // /< this particular command instance.
437437) {
438- return ur::level_zero::urEnqueueEventsWaitWithBarrier (
439- Queue, NumEventsInWaitList, EventWaitList, OutEvent);
438+ bool InterruptBased =
439+ EnqueueExtProp &&
440+ (EnqueueExtProp->flags & UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS);
441+ if (!InterruptBased) {
442+ return ur::level_zero::urEnqueueEventsWaitWithBarrier (
443+ Queue, NumEventsInWaitList, EventWaitList, OutEvent);
444+ }
445+ // Lock automatically releases when this goes out of scope.
446+ std::scoped_lock<ur_shared_mutex> lock (Queue->Mutex );
447+
448+ // Helper function for appending a barrier to a command list.
449+ auto insertBarrierIntoCmdList = [&Queue](ur_command_list_ptr_t CmdList,
450+ _ur_ze_event_list_t &EventWaitList,
451+ ur_event_handle_t &Event,
452+ bool IsInternal) {
453+ UR_CALL (createEventAndAssociateQueue (
454+ Queue, &Event, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, CmdList, IsInternal,
455+ false , std::nullopt , true ));
456+ Event->WaitList = EventWaitList;
457+
458+ // For in-order queue we don't need a real barrier, just wait for
459+ // requested events in potentially different queues and add a "barrier"
460+ // event signal because it is already guaranteed that previous commands
461+ // in this queue are completed when the signal is started.
462+ //
463+ // Only consideration here is that when profiling is used, signalEvent
464+ // cannot be used if EventWaitList.Length == 0. In those cases, we need
465+ // to fallback directly to barrier to have correct timestamps. See here:
466+ // https://spec.oneapi.io/level-zero/latest/core/api.html?highlight=appendsignalevent#_CPPv430zeCommandListAppendSignalEvent24ze_command_list_handle_t17ze_event_handle_t
467+ //
468+ // TODO: this and other special handling of in-order queues to be
469+ // updated when/if Level Zero adds native support for in-order queues.
470+ //
471+ if (Queue->isInOrderQueue () && InOrderBarrierBySignal &&
472+ !Queue->isProfilingEnabled ()) {
473+ if (EventWaitList.Length ) {
474+ if (CmdList->second .IsInOrderList ) {
475+ for (unsigned i = EventWaitList.Length ; i-- > 0 ;) {
476+ // If the event is a multidevice event, then given driver in order
477+ // lists, we cannot include this into the wait event list due to
478+ // driver limitations.
479+ if (EventWaitList.UrEventList [i]->IsMultiDevice ) {
480+ EventWaitList.Length --;
481+ if (EventWaitList.Length != i) {
482+ std::swap (EventWaitList.UrEventList [i],
483+ EventWaitList.UrEventList [EventWaitList.Length ]);
484+ std::swap (EventWaitList.ZeEventList [i],
485+ EventWaitList.ZeEventList [EventWaitList.Length ]);
486+ }
487+ }
488+ }
489+ }
490+ ZE2UR_CALL (
491+ zeCommandListAppendWaitOnEvents,
492+ (CmdList->first , EventWaitList.Length , EventWaitList.ZeEventList ));
493+ }
494+ ZE2UR_CALL (zeCommandListAppendSignalEvent,
495+ (CmdList->first , Event->ZeEvent ));
496+ } else {
497+ ZE2UR_CALL (zeCommandListAppendBarrier,
498+ (CmdList->first , Event->ZeEvent , EventWaitList.Length ,
499+ EventWaitList.ZeEventList ));
500+ }
501+
502+ return UR_RESULT_SUCCESS;
503+ };
504+
505+ // If the queue is in-order then each command in it effectively acts as a
506+ // barrier, so we don't need to do anything except if we were requested
507+ // a "barrier" event to be created. Or if we need to wait for events in
508+ // potentially different queues.
509+ //
510+ if (Queue->isInOrderQueue () && NumEventsInWaitList == 0 && !OutEvent) {
511+ return UR_RESULT_SUCCESS;
512+ }
513+
514+ ur_event_handle_t ResultEvent = nullptr ;
515+ bool IsInternal = OutEvent == nullptr ;
516+ // For in-order queue and wait-list which is empty or has events from
517+ // the same queue just use the last command event as the barrier event.
518+ // This optimization is disabled when profiling is enabled to ensure
519+ // accurate profiling values & the overhead that profiling incurs.
520+ if (Queue->isInOrderQueue () && !Queue->isProfilingEnabled () &&
521+ WaitListEmptyOrAllEventsFromSameQueue (Queue, NumEventsInWaitList,
522+ EventWaitList) &&
523+ Queue->LastCommandEvent && !Queue->LastCommandEvent ->IsDiscarded ) {
524+ UR_CALL (ur::level_zero::urEventRetain (Queue->LastCommandEvent ));
525+ ResultEvent = Queue->LastCommandEvent ;
526+ if (OutEvent) {
527+ *OutEvent = ResultEvent;
528+ }
529+ return UR_RESULT_SUCCESS;
530+ }
531+
532+ // Indicator for whether batching is allowed. This may be changed later in
533+ // this function, but allow it by default.
534+ bool OkToBatch = true ;
535+
536+ // If we have a list of events to make the barrier from, then we can create a
537+ // barrier on these and use the resulting event as our future barrier.
538+ // We use the same approach if
539+ // UR_L0_USE_MULTIPLE_COMMANDLIST_BARRIERS is not set to a
540+ // positive value.
541+ // We use the same approach if we have in-order queue because every command
542+ // depends on previous one, so we don't need to insert barrier to multiple
543+ // command lists.
544+ if (NumEventsInWaitList || !UseMultipleCmdlistBarriers ||
545+ Queue->isInOrderQueue ()) {
546+ // Retain the events as they will be owned by the result event.
547+ _ur_ze_event_list_t TmpWaitList;
548+ UR_CALL (TmpWaitList.createAndRetainUrZeEventList (
549+ NumEventsInWaitList, EventWaitList, Queue, false /* UseCopyEngine=*/ ));
550+
551+ // Get an arbitrary command-list in the queue.
552+ ur_command_list_ptr_t CmdList;
553+ UR_CALL (Queue->Context ->getAvailableCommandList (
554+ Queue, CmdList, false /* UseCopyEngine=*/ , NumEventsInWaitList,
555+ EventWaitList, OkToBatch, nullptr /* ForcedCmdQueue*/ ));
556+
557+ // Insert the barrier into the command-list and execute.
558+ UR_CALL (insertBarrierIntoCmdList (CmdList, TmpWaitList, ResultEvent,
559+ IsInternal));
560+
561+ UR_CALL (
562+ Queue->executeCommandList (CmdList, false /* IsBlocking*/ , OkToBatch));
563+
564+ // Because of the dependency between commands in the in-order queue we don't
565+ // need to keep track of any active barriers if we have in-order queue.
566+ if (UseMultipleCmdlistBarriers && !Queue->isInOrderQueue ()) {
567+ auto UREvent = reinterpret_cast <ur_event_handle_t >(ResultEvent);
568+ Queue->ActiveBarriers .add (UREvent);
569+ }
570+
571+ if (OutEvent) {
572+ *OutEvent = ResultEvent;
573+ }
574+ return UR_RESULT_SUCCESS;
575+ }
576+
577+ // Since there are no events to explicitly create a barrier for, we are
578+ // inserting a queue-wide barrier.
579+
580+ // Command list(s) for putting barriers.
581+ std::vector<ur_command_list_ptr_t > CmdLists;
582+
583+ // There must be at least one L0 queue.
584+ auto &ComputeGroup = Queue->ComputeQueueGroupsByTID .get ();
585+ auto &CopyGroup = Queue->CopyQueueGroupsByTID .get ();
586+ UR_ASSERT (!ComputeGroup.ZeQueues .empty () || !CopyGroup.ZeQueues .empty (),
587+ UR_RESULT_ERROR_INVALID_QUEUE);
588+
589+ size_t NumQueues = 0 ;
590+ for (auto &QueueMap :
591+ {Queue->ComputeQueueGroupsByTID , Queue->CopyQueueGroupsByTID })
592+ for (auto &QueueGroup : QueueMap)
593+ NumQueues += QueueGroup.second .ZeQueues .size ();
594+
595+ OkToBatch = true ;
596+ // Get an available command list tied to each command queue. We need
597+ // these so a queue-wide barrier can be inserted into each command
598+ // queue.
599+ CmdLists.reserve (NumQueues);
600+ for (auto &QueueMap :
601+ {Queue->ComputeQueueGroupsByTID , Queue->CopyQueueGroupsByTID })
602+ for (auto &QueueGroup : QueueMap) {
603+ bool UseCopyEngine =
604+ QueueGroup.second .Type != ur_queue_handle_t_::queue_type::Compute;
605+ if (Queue->UsingImmCmdLists ) {
606+ // If immediate command lists are being used, each will act as their own
607+ // queue, so we must insert a barrier into each.
608+ for (auto &ImmCmdList : QueueGroup.second .ImmCmdLists )
609+ if (ImmCmdList != Queue->CommandListMap .end ())
610+ CmdLists.push_back (ImmCmdList);
611+ } else {
612+ for (auto ZeQueue : QueueGroup.second .ZeQueues ) {
613+ if (ZeQueue) {
614+ ur_command_list_ptr_t CmdList;
615+ UR_CALL (Queue->Context ->getAvailableCommandList (
616+ Queue, CmdList, UseCopyEngine, NumEventsInWaitList,
617+ EventWaitList, OkToBatch, &ZeQueue));
618+ CmdLists.push_back (CmdList);
619+ }
620+ }
621+ }
622+ }
623+
624+ // If no activity has occurred on the queue then there will be no cmdlists.
625+ // We need one for generating an Event, so create one.
626+ if (CmdLists.size () == 0 ) {
627+ // Get any available command list.
628+ ur_command_list_ptr_t CmdList;
629+ UR_CALL (Queue->Context ->getAvailableCommandList (
630+ Queue, CmdList, false /* UseCopyEngine=*/ , NumEventsInWaitList,
631+ EventWaitList, OkToBatch, nullptr /* ForcedCmdQueue*/ ));
632+ CmdLists.push_back (CmdList);
633+ }
634+
635+ if (CmdLists.size () > 1 ) {
636+ // Insert a barrier into each unique command queue using the available
637+ // command-lists.
638+ std::vector<ur_event_handle_t > EventWaitVector (CmdLists.size ());
639+ for (size_t I = 0 ; I < CmdLists.size (); ++I) {
640+ _ur_ze_event_list_t waitlist;
641+ UR_CALL (insertBarrierIntoCmdList (
642+ CmdLists[I], waitlist, EventWaitVector[I], true /* IsInternal*/ ));
643+ }
644+ // If there were multiple queues we need to create a "convergence" event to
645+ // be our active barrier. This convergence event is signalled by a barrier
646+ // on all the events from the barriers we have inserted into each queue.
647+ // Use the first command list as our convergence command list.
648+ ur_command_list_ptr_t &ConvergenceCmdList = CmdLists[0 ];
649+
650+ // Create an event list. It will take ownership over all relevant events so
651+ // we relinquish ownership and let it keep all events it needs.
652+ _ur_ze_event_list_t BaseWaitList;
653+ UR_CALL (BaseWaitList.createAndRetainUrZeEventList (
654+ EventWaitVector.size (),
655+ reinterpret_cast <const ur_event_handle_t *>(EventWaitVector.data ()),
656+ Queue, ConvergenceCmdList->second .isCopy (Queue)));
657+
658+ // Insert a barrier with the events from each command-queue into the
659+ // convergence command list. The resulting event signals the convergence of
660+ // all barriers.
661+ UR_CALL (insertBarrierIntoCmdList (ConvergenceCmdList, BaseWaitList,
662+ ResultEvent, IsInternal));
663+ } else {
664+ // If there is only a single queue then insert a barrier and the single
665+ // result event can be used as our active barrier and used as the return
666+ // event. Take into account whether output event is discarded or not.
667+ _ur_ze_event_list_t waitlist;
668+ UR_CALL (insertBarrierIntoCmdList (CmdLists[0 ], waitlist, ResultEvent,
669+ IsInternal));
670+ }
671+
672+ // Execute each command list so the barriers can be encountered.
673+ for (ur_command_list_ptr_t &CmdList : CmdLists) {
674+ bool IsCopy =
675+ CmdList->second .isCopy (reinterpret_cast <ur_queue_handle_t >(Queue));
676+ const auto &CommandBatch =
677+ (IsCopy) ? Queue->CopyCommandBatch : Queue->ComputeCommandBatch ;
678+ // Only batch if the matching CmdList is already open.
679+ OkToBatch = CommandBatch.OpenCommandList == CmdList;
680+
681+ UR_CALL (
682+ Queue->executeCommandList (CmdList, false /* IsBlocking*/ , OkToBatch));
683+ }
684+
685+ UR_CALL (Queue->ActiveBarriers .clear ());
686+ Queue->ActiveBarriers .add (ResultEvent);
687+ if (OutEvent) {
688+ *OutEvent = ResultEvent;
689+ }
690+ return UR_RESULT_SUCCESS;
440691}
692+ /*
693+ ur_result_t urEnqueueEventsWaitWithBarrierExt(
694+ ur_queue_handle_t Queue, ///< [in] handle of the queue object
695+ const ur_exp_enqueue_ext_properties_t
695+ *EnqueueExtProp, ///< [in][optional] pointer to the extended enqueue properties
696+ uint32_t NumEventsInWaitList, ///< [in] size of the event wait list
698+ const ur_event_handle_t
699+ *EventWaitList, ///< [in][optional][range(0, numEventsInWaitList)]
700+ ///< pointer to a list of events that must be complete
701+ ///< before this command can be executed. If nullptr,
702+ ///< the numEventsInWaitList must be 0, indicating that
703+ ///< all previously enqueued commands must be complete.
704+ ur_event_handle_t
705+ *OutEvent ///< [in,out][optional] return an event object that identifies
706+ ///< this particular command instance.
707+ ) {
708+ bool InterruptBased = EnqueueExtProp &&
709+ (EnqueueExtProp->flags & UR_EXP_ENQUEUE_EXT_FLAG_LOW_POWER_EVENTS);
710+ ur_event_handle_t ResultEvent = nullptr;
711+
712+ if (InterruptBased) {
713+ // Create the event with interrupt-based properties
714+ ur_command_list_ptr_t CmdList;
715+ UR_CALL(Queue->Context->getAvailableCommandList(Queue, CmdList, false,
716+ NumEventsInWaitList, EventWaitList, true, nullptr));
717+ UR_CALL(createEventAndAssociateQueue(Queue, &ResultEvent,
718+ UR_COMMAND_EVENTS_WAIT_WITH_BARRIER, CmdList, true, false, std::nullopt,
719+ InterruptBased));
720+ }
721+
722+ ur_result_t result = ur::level_zero::urEnqueueEventsWaitWithBarrier(
723+ Queue, NumEventsInWaitList, EventWaitList, OutEvent);
724+
725+ if (InterruptBased && OutEvent) {
726+ *OutEvent = ResultEvent;
727+ }
728+ return result;
729+ }
730+
731+ */
441732
442733ur_result_t urEventGetInfo (
443734 ur_event_handle_t Event, // /< [in] handle of the event object
0 commit comments