4545 │ Prefix │ Commands added to UR command-buffer by UR user │ Suffix │
4646 └──────────┴────────────────────────────────────────────────┴─────────┘
4747
48- ┌───────────────────┬──────────────────────────────┐
49- Prefix │Reset signal event │ Barrier waiting on wait event│
50- └───────────────────┴──────────────────────────────┘
48+ ┌───────────────────┬──────────────┐──────────────────────────────┐
49+ Prefix │Reset signal event │ Reset events │ Barrier waiting on wait event│
50+ └───────────────────┴──────────────┘──────────────────────────────┘
5151
5252 ┌─────────────────────────────────────────────┐──────────────┐
53- Suffix │Barrier waiting on sync-point event, │ Reset events │
54- │signalling the UR command-buffer signal event│ │
53+ Suffix │Barrier waiting on sync-point event, │ Query CMD │
54+ │signalling the UR command-buffer signal event│ Timestamps │
5555 └─────────────────────────────────────────────┘──────────────┘
5656
5757 For a call to `urCommandBufferEnqueueExp` with an event_list `EL`,
@@ -433,6 +433,10 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
433433
434434 ZeStruct<ze_command_list_desc_t > ZeCommandListDesc;
435435 ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
436+ // Dependencies between commands are explicitly enforced by sync points when
437+ // enqueuing. Consequently, relaxing the command ordering in the command list
438+ // can enable the backend to further optimize the workload.
439+ ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING;
436440
437441 ze_command_list_handle_t ZeCommandList;
438442 // TODO We could optimize this by pooling both Level Zero command-lists and UR
@@ -494,18 +498,6 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) {
494498 (CommandBuffer->ZeCommandList , CommandBuffer->SignalEvent ->ZeEvent ,
495499 NumEvents, WaitEventList.data ()));
496500
497- // Reset the wait-event for the UR command-buffer that is signalled when its
498- // submission dependencies have been satisfied.
499- ZE2UR_CALL (zeCommandListAppendEventReset,
500- (CommandBuffer->ZeCommandList , CommandBuffer->WaitEvent ->ZeEvent ));
501-
502- // Reset the L0 events we use for command-buffer internal sync-points to the
503- // non-signalled state
504- for (auto Event : WaitEventList) {
505- ZE2UR_CALL (zeCommandListAppendEventReset,
506- (CommandBuffer->ZeCommandList , Event));
507- }
508-
509501 // Close the command list and have it ready for dispatch.
510502 ZE2UR_CALL (zeCommandListClose, (CommandBuffer->ZeCommandList ));
511503 return UR_RESULT_SUCCESS;
@@ -899,14 +891,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
899891 // Create a command-list that executes before `CommandListPtr` and signals
900892 // when the `EventWaitList` dependencies are complete.
901893 ur_command_list_ptr_t WaitCommandList{};
894+ UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList, false ,
895+ false ));
896+
897+ // Create a list of all the events that compose the command buffer
898+ // workload.
899+ // This loop also resets the L0 events we use for command-buffer internal
900+ // sync-points to the non-signalled state.
901+ // This is required for multiple submissions.
902+ const size_t NumEvents = CommandBuffer->SyncPoints .size ();
903+ std::vector<ze_event_handle_t > WaitEventList{NumEvents};
904+ for (size_t i = 0 ; i < NumEvents; i++) {
905+ auto ZeEvent = CommandBuffer->SyncPoints [i]->ZeEvent ;
906+ WaitEventList[i] = ZeEvent;
907+ ZE2UR_CALL (zeCommandListAppendEventReset,
908+ (WaitCommandList->first , ZeEvent));
909+ }
910+
902911 if (NumEventsInWaitList) {
903912 _ur_ze_event_list_t TmpWaitList;
904913 UR_CALL (TmpWaitList.createAndRetainUrZeEventList (
905914 NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
906915
907- UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList,
908- false , false ))
909-
910916 // Update the WaitList of the Wait Event
911917 // Events are appended to the WaitList if the WaitList is not empty
912918 if (CommandBuffer->WaitEvent ->WaitList .isEmpty ())
@@ -919,9 +925,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
919925 CommandBuffer->WaitEvent ->WaitList .Length ,
920926 CommandBuffer->WaitEvent ->WaitList .ZeEventList ));
921927 } else {
922- UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList,
923- false , false ));
924-
925928 ZE2UR_CALL (zeCommandListAppendSignalEvent,
926929 (WaitCommandList->first , CommandBuffer->WaitEvent ->ZeEvent ));
927930 }
@@ -930,17 +933,43 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
930933 ur_event_handle_t RetEvent{};
931934 // Create a command-list to signal RetEvent on completion
932935 ur_command_list_ptr_t SignalCommandList{};
933- if (Event) {
934- UR_CALL (Queue->Context ->getAvailableCommandList (Queue, SignalCommandList,
935- false , false ));
936+ UR_CALL (Queue->Context ->getAvailableCommandList (Queue, SignalCommandList,
937+ false , false ));
938+ // Reset the wait-event for the UR command-buffer that is signalled when its
939+ // submission dependencies have been satisfied.
940+ ZE2UR_CALL (zeCommandListAppendEventReset,
941+ (SignalCommandList->first , CommandBuffer->WaitEvent ->ZeEvent ));
936942
943+ if (Event) {
937944 UR_CALL (createEventAndAssociateQueue (Queue, &RetEvent,
938945 UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP,
939946 SignalCommandList, false ));
940947
941- ZE2UR_CALL (zeCommandListAppendBarrier,
942- (SignalCommandList->first , RetEvent->ZeEvent , 1 ,
943- &(CommandBuffer->SignalEvent ->ZeEvent )));
948+ if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) {
949+ // Multiple submissions of a command buffer imply that we need to save
950+ // the event timestamps before resubmitting the command buffer. We
951+ // therefore copy these timestamps into a dedicated USM memory section
952+ // before completing the command buffer execution, and then attach this
953+ // memory to the event returned to users to allow the profiling
954+ // engine to recover these timestamps.
955+ command_buffer_profiling_t *Profiling = new command_buffer_profiling_t ();
956+
957+ Profiling->NumEvents = WaitEventList.size ();
958+ Profiling->Timestamps =
959+ new ze_kernel_timestamp_result_t [Profiling->NumEvents ];
960+
961+ ZE2UR_CALL (zeCommandListAppendQueryKernelTimestamps,
962+ (SignalCommandList->first , WaitEventList.size (),
963+ WaitEventList.data (), (void *)Profiling->Timestamps , 0 ,
964+ RetEvent->ZeEvent , 1 ,
965+ &(CommandBuffer->SignalEvent ->ZeEvent )));
966+
967+ RetEvent->CommandData = static_cast <void *>(Profiling);
968+ } else {
969+ ZE2UR_CALL (zeCommandListAppendBarrier,
970+ (SignalCommandList->first , RetEvent->ZeEvent , 1 ,
971+ &(CommandBuffer->SignalEvent ->ZeEvent )));
972+ }
944973 }
945974
946975 // Execute our command-lists asynchronously
0 commit comments