4545 │ Prefix │ Commands added to UR command-buffer by UR user │ Suffix │
4646 └──────────┴────────────────────────────────────────────────┴─────────┘
4747
48- ┌───────────────────┬──────────────────────────────┐
49- Prefix │Reset signal event │ Barrier waiting on wait event│
50- └───────────────────┴──────────────────────────────┘
48+ ┌───────────────────┬──────────────┐────────────── ────────────────┐
49+ Prefix │Reset signal event │ Reset events │ Barrier waiting on wait event│
50+ └───────────────────┴──────────────┘────────────── ────────────────┘
5151
5252 ┌─────────────────────────────────────────────┐──────────────┐
53- Suffix │Barrier waiting on sync-point event, │ Reset events │
54- │signalling the UR command-buffer signal event│ │
53+ Suffix │Barrier waiting on sync-point event, │ Query CMD │
54+ │signalling the UR command-buffer signal event│ Timestamps │
5555 └─────────────────────────────────────────────┘──────────────┘
5656
5757 For a call to `urCommandBufferEnqueueExp` with an event_list `EL`,
@@ -431,6 +431,10 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
431431
432432 ZeStruct<ze_command_list_desc_t > ZeCommandListDesc;
433433 ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
434+ // Dependencies between commands are explicitly enforced by sync points when
435+ // enqueuing. Consequently, relaxing the command ordering in the command list
436+ // can enable the backend to further optimize the workload.
437+ ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING;
434438
435439 ze_command_list_handle_t ZeCommandList;
436440 // TODO We could optimize this by pooling both Level Zero command-lists and UR
@@ -491,18 +495,6 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) {
491495 (CommandBuffer->ZeCommandList , CommandBuffer->SignalEvent ->ZeEvent ,
492496 NumEvents, WaitEventList.data ()));
493497
494- // Reset the wait-event for the UR command-buffer that is signalled when its
495- // submission dependencies have been satisfied.
496- ZE2UR_CALL (zeCommandListAppendEventReset,
497- (CommandBuffer->ZeCommandList , CommandBuffer->WaitEvent ->ZeEvent ));
498-
499- // Reset the L0 events we use for command-buffer internal sync-points to the
500- // non-signalled state
501- for (auto Event : WaitEventList) {
502- ZE2UR_CALL (zeCommandListAppendEventReset,
503- (CommandBuffer->ZeCommandList , Event));
504- }
505-
506498 // Close the command list and have it ready for dispatch.
507499 ZE2UR_CALL (zeCommandListClose, (CommandBuffer->ZeCommandList ));
508500 return UR_RESULT_SUCCESS;
@@ -876,6 +868,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
876868
877869 // Create a command-list that executes before `CommandListPtr` and signals
878870 // when the `EventWaitList` dependencies are complete.
871+ ur_command_list_ptr_t WaitCommandList{};
872+ UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList, false ,
873+ false ));
874+
875+ // Create a list of all the events that compose the command buffer
876+ // workload.
877+ // This loop also resets the L0 events we use for command-buffer internal
878+ // sync-points to the non-signalled state.
879+ // This is required for multiple submissions.
880+ const size_t NumEvents = CommandBuffer->SyncPoints .size ();
881+ std::vector<ze_event_handle_t > WaitEventList{NumEvents};
882+ for (size_t i = 0 ; i < NumEvents; i++) {
883+ auto ZeEvent = CommandBuffer->SyncPoints [i]->ZeEvent ;
884+ WaitEventList[i] = ZeEvent;
885+ ZE2UR_CALL (zeCommandListAppendEventReset,
886+ (WaitCommandList->first , ZeEvent));
887+ }
888+
879889 bool MustSignalWaitEvent = true ;
880890 if (NumEventsInWaitList) {
881891 _ur_ze_event_list_t TmpWaitList;
@@ -890,10 +900,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
890900 CommandBuffer->WaitEvent ->WaitList .insert (TmpWaitList);
891901
892902 if (!CommandBuffer->WaitEvent ->WaitList .isEmpty ()) {
893- ur_command_list_ptr_t WaitCommandList{};
894- UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList,
895- false , false ))
896-
897903 ZE2UR_CALL (zeCommandListAppendBarrier,
898904 (WaitCommandList->first , CommandBuffer->WaitEvent ->ZeEvent ,
899905 CommandBuffer->WaitEvent ->WaitList .Length ,
@@ -916,22 +922,50 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
916922
917923 // Execution event for this enqueue of the UR command-buffer
918924 ur_event_handle_t RetEvent{};
919- if (Event) {
920- // Create a command-list to signal RetEvent on completion
921- ur_command_list_ptr_t SignalCommandList{};
922- UR_CALL (Queue->Context ->getAvailableCommandList (Queue, SignalCommandList,
923- false , false ));
924925
926+ // Create a command-list to signal RetEvent on completion
927+ ur_command_list_ptr_t SignalCommandList{};
928+ UR_CALL (Queue->Context ->getAvailableCommandList (Queue, SignalCommandList,
929+ false , false ));
930+ // Reset the wait-event for the UR command-buffer that is signalled when its
931+ // submission dependencies have been satisfied.
932+ ZE2UR_CALL (zeCommandListAppendEventReset,
933+ (SignalCommandList->first , CommandBuffer->WaitEvent ->ZeEvent ));
934+
935+ if (Event) {
925936 UR_CALL (createEventAndAssociateQueue (Queue, &RetEvent,
926937 UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP,
927938 SignalCommandList, false , true ));
928939
929- ZE2UR_CALL (zeCommandListAppendBarrier,
930- (SignalCommandList->first , RetEvent->ZeEvent , 1 ,
931- &(CommandBuffer->SignalEvent ->ZeEvent )));
932- Queue->executeCommandList (SignalCommandList, false , false );
940+ if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) {
941+ // Multiple submissions of a command buffer imply that we need to save
942+ // the event timestamps before resubmitting the command buffer. We
943+ // therefore copy these timestamps into a dedicated USM memory section
944+ // before completing the command buffer execution, and then attach this
945+ // memory to the event returned to users to allow the profiling
946+ // engine to recover these timestamps.
947+ command_buffer_profiling_t *Profiling = new command_buffer_profiling_t ();
948+
949+ Profiling->NumEvents = WaitEventList.size ();
950+ Profiling->Timestamps =
951+ new ze_kernel_timestamp_result_t [Profiling->NumEvents ];
952+
953+ ZE2UR_CALL (zeCommandListAppendQueryKernelTimestamps,
954+ (SignalCommandList->first , WaitEventList.size (),
955+ WaitEventList.data (), (void *)Profiling->Timestamps , 0 ,
956+ RetEvent->ZeEvent , 1 ,
957+ &(CommandBuffer->SignalEvent ->ZeEvent )));
958+
959+ RetEvent->CommandData = static_cast <void *>(Profiling);
960+ } else {
961+ ZE2UR_CALL (zeCommandListAppendBarrier,
962+ (SignalCommandList->first , RetEvent->ZeEvent , 1 ,
963+ &(CommandBuffer->SignalEvent ->ZeEvent )));
964+ }
933965 }
934966
967+ Queue->executeCommandList (SignalCommandList, false , false );
968+
935969 if (Event) {
936970 *Event = RetEvent;
937971 }
0 commit comments