4545 │ Prefix │ Commands added to UR command-buffer by UR user │ Suffix │
4646 └──────────┴────────────────────────────────────────────────┴─────────┘
4747
48- ┌───────────────────┬──────────────────────────────┐
49- Prefix │Reset signal event │ Barrier waiting on wait event│
50- └───────────────────┴──────────────────────────────┘
48+ ┌───────────────────┬──────────────┐────────────── ────────────────┐
49+ Prefix │Reset signal event │ Reset events │ Barrier waiting on wait event│
50+ └───────────────────┴──────────────┘────────────── ────────────────┘
5151
5252 ┌─────────────────────────────────────────────┐──────────────┐
53- Suffix │Barrier waiting on sync-point event, │ Reset events │
54- │signalling the UR command-buffer signal event│ │
53+ Suffix │Barrier waiting on sync-point event, │ Query CMD │
54+ │signalling the UR command-buffer signal event│ Timestamps │
5555 └─────────────────────────────────────────────┘──────────────┘
5656
5757 For a call to `urCommandBufferEnqueueExp` with an event_list `EL`,
@@ -433,6 +433,10 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
433433
434434 ZeStruct<ze_command_list_desc_t > ZeCommandListDesc;
435435 ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
436+ // Dependencies between commands are explicitly enforced by sync points when
437+ // enqueuing. Consequently, relax the command ordering in the command list
438+ // can enable the backend to further optimize the workload
439+ ZeCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING;
436440
437441 ze_command_list_handle_t ZeCommandList;
438442 // TODO We could optimize this by pooling both Level Zero command-lists and UR
@@ -499,13 +503,6 @@ urCommandBufferFinalizeExp(ur_exp_command_buffer_handle_t CommandBuffer) {
499503 ZE2UR_CALL (zeCommandListAppendEventReset,
500504 (CommandBuffer->ZeCommandList , CommandBuffer->WaitEvent ->ZeEvent ));
501505
502- // Reset the L0 events we use for command-buffer internal sync-points to the
503- // non-signalled state
504- for (auto Event : WaitEventList) {
505- ZE2UR_CALL (zeCommandListAppendEventReset,
506- (CommandBuffer->ZeCommandList , Event));
507- }
508-
509506 // Close the command list and have it ready for dispatch.
510507 ZE2UR_CALL (zeCommandListClose, (CommandBuffer->ZeCommandList ));
511508 return UR_RESULT_SUCCESS;
@@ -899,14 +896,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
899896 // Create command-list to execute before `CommandListPtr` and will signal
900897 // when `EventWaitList` dependencies are complete.
901898 ur_command_list_ptr_t WaitCommandList{};
899+ UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList, false ,
900+ false ));
901+
902+ // Create a list of events of all the events that compose the command buffer
903+ // workload.
904+ // This loop also resets the L0 events we use for command-buffer internal
905+ // sync-points to the non-signalled state.
906+ // This is required for multiple submissions.
907+ const size_t NumEvents = CommandBuffer->SyncPoints .size ();
908+ std::vector<ze_event_handle_t > WaitEventList{NumEvents};
909+ for (size_t i = 0 ; i < NumEvents; i++) {
910+ auto ZeEvent = CommandBuffer->SyncPoints [i]->ZeEvent ;
911+ WaitEventList[i] = ZeEvent;
912+ ZE2UR_CALL (zeCommandListAppendEventReset,
913+ (WaitCommandList->first , ZeEvent));
914+ }
915+
902916 if (NumEventsInWaitList) {
903917 _ur_ze_event_list_t TmpWaitList;
904918 UR_CALL (TmpWaitList.createAndRetainUrZeEventList (
905919 NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));
906920
907- UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList,
908- false , false ))
909-
910921 // Update the WaitList of the Wait Event
911922 // Events are appended to the WaitList if the WaitList is not empty
912923 if (CommandBuffer->WaitEvent ->WaitList .isEmpty ())
@@ -919,9 +930,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
919930 CommandBuffer->WaitEvent ->WaitList .Length ,
920931 CommandBuffer->WaitEvent ->WaitList .ZeEventList ));
921932 } else {
922- UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList,
923- false , false ));
924-
925933 ZE2UR_CALL (zeCommandListAppendSignalEvent,
926934 (WaitCommandList->first , CommandBuffer->WaitEvent ->ZeEvent ));
927935 }
@@ -943,22 +951,38 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
943951 &(CommandBuffer->SignalEvent ->ZeEvent )));
944952
945953 if ((Queue->Properties & UR_QUEUE_FLAG_PROFILING_ENABLE)) {
946- // We create an additional signal specific to the current execution of the
947- // CommandBuffer. This signal is needed for profiling the execution time
948- // of the CommandBuffer. It waits for the WaitEvent to be signaled
949- // which indicates the start of the CommandBuffer actual execution.
950- // This event is embedded into the Event return to the user to allow
951- // the profiling engine to retrieve it.
952- ur_event_handle_t StartEvent{};
953- UR_CALL (createEventAndAssociateQueue (
954- Queue, &StartEvent, UR_COMMAND_COMMAND_BUFFER_ENQUEUE_EXP,
955- WaitCommandList, false ));
956-
957- ZE2UR_CALL (zeCommandListAppendBarrier,
958- (WaitCommandList->first , StartEvent->ZeEvent , 1 ,
959- &(CommandBuffer->WaitEvent ->ZeEvent )));
960-
961- RetEvent->CommandData = StartEvent;
954+ // Multiple submissions of a command buffer implies that we need to save
955+ // the event timestamps before resubmiting the command buffer. We
956+ // therefore copy the these timestamps in a dedicated USM memory section
957+ // before completing the command buffer execution, and then attach this
958+ // memory to the event returned to users to allow to allow the profiling
959+ // engine to recover these timestamps.
960+ ur_usm_desc_t USMDesc{};
961+ ur_usm_device_desc_t UsmDeviceDesc{};
962+ UsmDeviceDesc.stype = UR_STRUCTURE_TYPE_USM_DEVICE_DESC;
963+ ur_usm_host_desc_t UsmHostDesc{};
964+ UsmHostDesc.stype = UR_STRUCTURE_TYPE_USM_HOST_DESC;
965+ UsmDeviceDesc.pNext = &UsmHostDesc;
966+ USMDesc.pNext = &UsmDeviceDesc;
967+ USMDesc.align = 4 ; // 4byte-aligned
968+
969+ size_t Size = WaitEventList.size () * sizeof (ze_kernel_timestamp_result_t );
970+
971+ struct command_buffer_profiling_t *Profiling =
972+ new command_buffer_profiling_t ();
973+
974+ Profiling->NumEvents = WaitEventList.size ();
975+
976+ urUSMSharedAlloc (RetEvent->Context , CommandBuffer->Device , &USMDesc,
977+ nullptr , Size, (void **)&Profiling->Timestamps );
978+
979+ ZE2UR_CALL (zeCommandListAppendQueryKernelTimestamps,
980+ (SignalCommandList->first , WaitEventList.size (),
981+ WaitEventList.data (), Profiling->Timestamps , 0 ,
982+ RetEvent->ZeEvent , 1 ,
983+ &(CommandBuffer->SignalEvent ->ZeEvent )));
984+
985+ RetEvent->CommandData = static_cast <void *>(Profiling);
962986 }
963987 }
964988
0 commit comments