@@ -43,6 +43,20 @@ static const bool UseMultipleCmdlistBarriers = [] {
4343 return std::atoi (UseMultipleCmdlistBarriersFlag) > 0 ;
4444}();
4545
46+ bool WaitListEmptyOrAllEventsFromSameQueue (
47+ ur_queue_handle_t Queue, uint32_t NumEventsInWaitList,
48+ const ur_event_handle_t *EventWaitList) {
49+ if (!NumEventsInWaitList)
50+ return true ;
51+
52+ for (uint32_t i = 0 ; i < NumEventsInWaitList; ++i) {
53+ if (Queue != EventWaitList[i]->UrQueue )
54+ return false ;
55+ }
56+
57+ return true ;
58+ }
59+
4660UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWait (
4761 ur_queue_handle_t Queue, // /< [in] handle of the queue object
4862 uint32_t NumEventsInWaitList, // /< [in] size of the event wait list
@@ -206,21 +220,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueEventsWaitWithBarrier(
206220 bool IsInternal = OutEvent == nullptr ;
207221 ur_event_handle_t *Event = OutEvent ? OutEvent : &InternalEvent;
208222
209- auto WaitListEmptyOrAllEventsFromSameQueue = [Queue, NumEventsInWaitList,
210- EventWaitList]() {
211- if (!NumEventsInWaitList)
212- return true ;
213-
214- for (uint32_t I = 0 ; I < NumEventsInWaitList; ++I)
215- if (Queue != EventWaitList[I]->UrQueue )
216- return false ;
217-
218- return true ;
219- };
220-
221223 // For in-order queue and wait-list which is empty or has events from
222224 // the same queue just use the last command event as the barrier event.
223- if (Queue->isInOrderQueue () && WaitListEmptyOrAllEventsFromSameQueue () &&
225+ if (Queue->isInOrderQueue () &&
226+ WaitListEmptyOrAllEventsFromSameQueue (Queue, NumEventsInWaitList,
227+ EventWaitList) &&
224228 Queue->LastCommandEvent && !Queue->LastCommandEvent ->IsDiscarded ) {
225229 UR_CALL (urEventRetain (Queue->LastCommandEvent ));
226230 *Event = Queue->LastCommandEvent ;
@@ -1189,6 +1193,23 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
11891193 CurQueue->LastCommandEvent && CurQueue->LastCommandEvent ->IsDiscarded )
11901194 IncludeLastCommandEvent = false ;
11911195
1196+ // If we are using L0 native implementation for handling in-order queues,
1197+ // then we don't need to add the last enqueued event into the waitlist, as
1198+ // the native driver implementation will already ensure in-order semantics.
1199+ // The only exception is when a different immediate command list was last
1200+ // used on the same UR Queue.
1201+ if (CurQueue->Device ->useDriverInOrderLists () && CurQueue->isInOrderQueue () &&
1202+ CurQueue->UsingImmCmdLists ) {
1203+ auto QueueGroup = CurQueue->getQueueGroup (UseCopyEngine);
1204+ uint32_t QueueGroupOrdinal, QueueIndex;
1205+ auto NextIndex = QueueGroup.getQueueIndex (&QueueGroupOrdinal, &QueueIndex,
1206+ /* QueryOnly */ true );
1207+ auto NextImmCmdList = QueueGroup.ImmCmdLists [NextIndex];
1208+ IncludeLastCommandEvent &=
1209+ CurQueue->LastUsedCommandList != CurQueue->CommandListMap .end () &&
1210+ NextImmCmdList != CurQueue->LastUsedCommandList ;
1211+ }
1212+
11921213 try {
11931214 uint32_t TmpListLength = 0 ;
11941215
@@ -1205,6 +1226,16 @@ ur_result_t _ur_ze_event_list_t::createAndRetainUrZeEventList(
12051226 this ->UrEventList = new ur_event_handle_t [EventListLength];
12061227 }
12071228
1229+ // For an in-order queue whose wait-list is empty or has events only from
1230+ // the same queue, we don't need to wait on any additional events.
1231+ if (CurQueue->Device ->useDriverInOrderLists () &&
1232+ CurQueue->isInOrderQueue () &&
1233+ WaitListEmptyOrAllEventsFromSameQueue (CurQueue, EventListLength,
1234+ EventList)) {
1235+ this ->Length = TmpListLength;
1236+ return UR_RESULT_SUCCESS;
1237+ }
1238+
12081239 if (EventListLength > 0 ) {
12091240 for (uint32_t I = 0 ; I < EventListLength; I++) {
12101241 {
0 commit comments