@@ -283,14 +283,46 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWait(
283283 return UR_RESULT_SUCCESS;
284284}
285285
286+ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrierImpl (
287+ uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
288+ ur_event_handle_t *phEvent) {
289+ TRACK_SCOPE_LATENCY (
290+ " ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrier" );
291+
292+ std::scoped_lock<ur_shared_mutex> lock (this ->Mutex );
293+
294+ if (!numEventsInWaitList && !phEvent) {
295+ // nop
296+ return UR_RESULT_SUCCESS;
297+ }
298+
299+ auto signalEvent =
300+ getSignalEvent (phEvent, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER);
301+ auto [pWaitEvents, numWaitEvents] =
302+ getWaitListView (phEventWaitList, numEventsInWaitList);
303+
304+ ZE2UR_CALL (zeCommandListAppendBarrier,
305+ (handler.commandList .get (), signalEvent->getZeEvent (),
306+ numWaitEvents, pWaitEvents));
307+
308+ return UR_RESULT_SUCCESS;
309+ }
310+
286311ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrier (
287312 uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
288313 ur_event_handle_t *phEvent) {
289314 // For in-order queue we don't need a real barrier, just wait for
290315 // requested events in potentially different queues and add a "barrier"
291316 // event signal because it is already guaranteed that previous commands
292- // in this queue are completed when the signal is started.
293- return enqueueEventsWait (numEventsInWaitList, phEventWaitList, phEvent);
317+ // in this queue are completed when the signal is started. However, we do
318+ // need to use barrier if profiling is enabled: see
319+ // zeCommandListAppendWaitOnEvents
320+ if ((flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0 ) {
321+ return enqueueEventsWaitWithBarrierImpl (numEventsInWaitList,
322+ phEventWaitList, phEvent);
323+ } else {
324+ return enqueueEventsWait (numEventsInWaitList, phEventWaitList, phEvent);
325+ }
294326}
295327
296328ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrierExt (
@@ -757,8 +789,8 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch(
757789 getWaitListView (phEventWaitList, numEventsInWaitList);
758790
759791 if (pWaitEvents) {
760- ZE2UR_CALL (zeCommandListAppendBarrier, (handler. commandList . get (), nullptr ,
761- numWaitEvents, pWaitEvents));
792+ ZE2UR_CALL (zeCommandListAppendWaitOnEvents ,
793+ (handler. commandList . get (), numWaitEvents, pWaitEvents));
762794 }
763795 // TODO: figure out how to translate "flags"
764796 ZE2UR_CALL (zeCommandListAppendMemoryPrefetch,
@@ -789,8 +821,8 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size,
789821 auto [pWaitEvents, numWaitEvents] = getWaitListView (nullptr , 0 );
790822
791823 if (pWaitEvents) {
792- ZE2UR_CALL (zeCommandListAppendBarrier, (handler. commandList . get (), nullptr ,
793- numWaitEvents, pWaitEvents));
824+ ZE2UR_CALL (zeCommandListAppendWaitOnEvents ,
825+ (handler. commandList . get (), numWaitEvents, pWaitEvents));
794826 }
795827
796828 // TODO: figure out how to translate "flags"
0 commit comments