1010#include " command_buffer.hpp"
1111#include " ur_level_zero.hpp"
1212
13- /* Command-buffer Extension
14-
15- The UR interface for submitting a UR command-buffer takes a list
16- of events to wait on, and returns an event representing the completion of
17- that particular submission of the command-buffer.
18-
19- However, in `zeCommandQueueExecuteCommandLists` there are no parameters to
20- take a waitlist and also the only sync primitive returned is to block on
21- host.
22-
23- In order to get the UR command-buffer enqueue semantics we want with L0
24- this adapter adds extra commands to the L0 command-list representing a
25- UR command-buffer.
26-
27- Prefix - Commands added to the start of the L0 command-list by L0 adapter.
28- Suffix - Commands added to the end of the L0 command-list by L0 adapter.
29-
30- These extra commands operate on L0 event synchronisation primitives used by
31- the command-list to interact with the external UR wait-list and UR return
32- event required for the enqueue interface.
33-
34- The `ur_exp_command_buffer_handle_t` class for this adapter contains a
35- SignalEvent which signals the completion of the command-list in the suffix,
36- and is reset in the prefix. This signal is detected by a new UR return event
37- created on UR command-buffer enqueue.
38-
39- There is also a WaitEvent used by the `ur_exp_command_buffer_handle_t` class
40- in the prefix to wait on any dependencies passed in the enqueue wait-list.
41- This WaitEvent is reset at the end of the suffix, along with reset commands
42- to reset the L0 events used to implement the UR sync-points.
43-
44- ┌──────────┬────────────────────────────────────────────────┬─────────┐
45- │ Prefix │ Commands added to UR command-buffer by UR user │ Suffix │
46- └──────────┴────────────────────────────────────────────────┴─────────┘
47-
48- ┌───────────────────┬──────────────┐──────────────────────────────┐
49- Prefix │Reset signal event │ Reset events │ Barrier waiting on wait event│
50- └───────────────────┴──────────────┘──────────────────────────────┘
51-
52- ┌─────────────────────────────────────────────┐──────────────┐
53- Suffix │Barrier waiting on sync-point event, │ Query CMD │
54- │signaling the UR command-buffer signal event │ Timestamps │
55- └─────────────────────────────────────────────┘──────────────┘
56-
57- For a call to `urCommandBufferEnqueueExp` with an event_list `EL`,
58- command-buffer `CB`, and return event `RE` our implementation has to create
59- and submit two new command-lists for the above approach to work. One before
60- the command-list with extra commands associated with `CB`, and the other
61- after `CB`.
62-
63- Command-list created on `urCommandBufferEnqueueExp` to execute before `CB`:
64- ┌───────────────────────────────────────────────────────────┐
65- │Barrier on `EL` then signals `CB` WaitEvent when completed │
66- └───────────────────────────────────────────────────────────┘
67-
68- Command-list created on `urCommandBufferEnqueueExp` to execute after `CB`:
69- ┌─────────────────────────────────────────────────────────────┐
70- │Barrier on `CB` SignalEvent that signals `RE` when completed │
71- └─────────────────────────────────────────────────────────────┘
72-
73- Drawbacks
74- ---------
75-
76- There are two drawbacks to this approach:
77-
78- 1. We use 3x the command-list resources, if there are many UR command-buffers
79- in flight, this may exhaust L0 driver resources.
80-
81- 2. Each command list is submitted individually with a
82- `ur_queue_handle_t_::executeCommandList` call which introduces serialization in
83- the submission pipeline that is heavier than having a barrier or a
84- waitForEvents on the same list. Resulting in additional latency when executing
85- graphs.
86-
13+ /* L0 Command-buffer Extension Doc see:
14+ https://github.com/intel/llvm/blob/sycl/sycl/doc/design/CommandGraph.md#level-zero
8715*/
8816
8917ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_ (
9018 ur_context_handle_t Context, ur_device_handle_t Device,
9119 ze_command_list_handle_t CommandList,
20+ ze_command_list_handle_t CommandListResetEvents,
9221 ZeStruct<ze_command_list_desc_t > ZeDesc,
9322 const ur_exp_command_buffer_desc_t *Desc)
9423 : Context(Context), Device(Device), ZeCommandList(CommandList),
24+ ZeCommandListResetEvents(CommandListResetEvents),
9525 ZeCommandListDesc(ZeDesc), ZeFencesList(), QueueProperties(),
9626 SyncPoints(), NextSyncPoint(0 ) {
9727 (void )Desc;
@@ -114,6 +44,12 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {
11444 ZE_CALL_NOCHECK (zeCommandListDestroy, (ZeCommandList));
11545 }
11646
47+ // Release the memory allocated to the CommandListResetEvents stored in the
48+ // command_buffer
49+ if (ZeCommandListResetEvents) {
50+ ZE_CALL_NOCHECK (zeCommandListDestroy, (ZeCommandListResetEvents));
51+ }
52+
11753 // Release additional signal and wait events used by command_buffer
11854 if (SignalEvent) {
11955 CleanupCompletedEvent (SignalEvent, false );
@@ -123,6 +59,10 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {
12359 CleanupCompletedEvent (WaitEvent, false );
12460 urEventReleaseInternal (WaitEvent);
12561 }
62+ if (AllResetEvent) {
63+ CleanupCompletedEvent (AllResetEvent, false );
64+ urEventReleaseInternal (AllResetEvent);
65+ }
12666
12767 // Release events added to the command_buffer
12868 for (auto &Sync : SyncPoints) {
@@ -434,6 +374,13 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
434374
435375 ZeStruct<ze_command_list_desc_t > ZeCommandListDesc;
436376 ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
377+
378+ ze_command_list_handle_t ZeCommandListResetEvents;
379+ // Create a command-list for resetting the events associated with enqueued commands.
380+ ZE2UR_CALL (zeCommandListCreate,
381+ (Context->ZeContext , Device->ZeDevice , &ZeCommandListDesc,
382+ &ZeCommandListResetEvents));
383+
437384 // Dependencies between commands are explicitly enforced by sync points when
438385 // enqueuing. Consequently, relax the command ordering in the command list
439386 // can enable the backend to further optimize the workload
@@ -446,7 +393,8 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
446393 &ZeCommandListDesc, &ZeCommandList));
447394 try {
448395 *CommandBuffer = new ur_exp_command_buffer_handle_t_ (
449- Context, Device, ZeCommandList, ZeCommandListDesc, CommandBufferDesc);
396+ Context, Device, ZeCommandList, ZeCommandListResetEvents,
397+ ZeCommandListDesc, CommandBufferDesc);
450398 } catch (const std::bad_alloc &) {
451399 return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
452400 } catch (...) {
@@ -460,13 +408,19 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
460408 &RetCommandBuffer->SignalEvent ));
461409 UR_CALL (EventCreate (Context, nullptr , false , false ,
462410 &RetCommandBuffer->WaitEvent ));
411+ UR_CALL (EventCreate (Context, nullptr , false , false ,
412+ &RetCommandBuffer->AllResetEvent ));
463413
464414 // Add prefix commands
465- ZE2UR_CALL (zeCommandListAppendEventReset,
466- (ZeCommandList, RetCommandBuffer->SignalEvent ->ZeEvent ));
415+ ZE2UR_CALL (
416+ zeCommandListAppendEventReset,
417+ (ZeCommandListResetEvents, RetCommandBuffer->SignalEvent ->ZeEvent ));
418+ std::vector<ze_event_handle_t > PrecondEvents = {
419+ RetCommandBuffer->WaitEvent ->ZeEvent ,
420+ RetCommandBuffer->AllResetEvent ->ZeEvent };
467421 ZE2UR_CALL (
468422 zeCommandListAppendBarrier,
469- (ZeCommandList, nullptr , 1 , &RetCommandBuffer-> WaitEvent -> ZeEvent ));
423+ (ZeCommandList, nullptr , PrecondEvents. size (), PrecondEvents. data () ));
470424 return UR_RESULT_SUCCESS;
471425}
472426
@@ -488,20 +442,29 @@ urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t CommandBuffer) {
488442UR_APIEXPORT ur_result_t UR_APICALL
489443urCommandBufferFinalizeExp (ur_exp_command_buffer_handle_t CommandBuffer) {
490444 // Create a list of events for our signal event to wait on
445+ // This loop also resets the L0 events we use for command-buffer internal
446+ // sync-points to the non-signaled state.
447+ // This is required for multiple submissions.
491448 const size_t NumEvents = CommandBuffer->SyncPoints .size ();
492- std::vector<ze_event_handle_t > WaitEventList{NumEvents};
493449 for (size_t i = 0 ; i < NumEvents; i++) {
494- WaitEventList[i] = CommandBuffer->SyncPoints [i]->ZeEvent ;
450+ auto ZeEvent = CommandBuffer->SyncPoints [i]->ZeEvent ;
451+ CommandBuffer->ZeEventsList .push_back (ZeEvent);
452+ ZE2UR_CALL (zeCommandListAppendEventReset,
453+ (CommandBuffer->ZeCommandListResetEvents , ZeEvent));
495454 }
455+ ZE2UR_CALL (zeCommandListAppendSignalEvent,
456+ (CommandBuffer->ZeCommandListResetEvents ,
457+ CommandBuffer->AllResetEvent ->ZeEvent ));
496458
497459 // Wait for all the user added commands to complete, and signal the
498460 // command-buffer signal-event when they are done.
499461 ZE2UR_CALL (zeCommandListAppendBarrier,
500462 (CommandBuffer->ZeCommandList , CommandBuffer->SignalEvent ->ZeEvent ,
501- NumEvents, WaitEventList .data ()));
463+ NumEvents, CommandBuffer-> ZeEventsList .data ()));
502464
503- // Close the command list and have it ready for dispatch.
465+ // Close the command lists and have them ready for dispatch.
504466 ZE2UR_CALL (zeCommandListClose, (CommandBuffer->ZeCommandList ));
467+ ZE2UR_CALL (zeCommandListClose, (CommandBuffer->ZeCommandListResetEvents ));
505468 return UR_RESULT_SUCCESS;
506469}
507470
@@ -875,26 +838,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
875838 ZE2UR_CALL (zeFenceCreate, (ZeCommandQueue, &ZeFenceDesc, &ZeFence));
876839 CommandBuffer->ZeFencesList .push_back (ZeFence);
877840
878- // Create command-list to execute before `CommandListPtr` and will signal
879- // when `EventWaitList` dependencies are complete.
880- ur_command_list_ptr_t WaitCommandList{};
881- UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList, false ,
882- false ));
883-
884- // Create a list of events of all the events that compose the command buffer
885- // workload.
886- // This loop also resets the L0 events we use for command-buffer internal
887- // sync-points to the non-signaled state.
888- // This is required for multiple submissions.
889- const size_t NumEvents = CommandBuffer->SyncPoints .size ();
890- std::vector<ze_event_handle_t > WaitEventList{NumEvents};
891- for (size_t i = 0 ; i < NumEvents; i++) {
892- auto ZeEvent = CommandBuffer->SyncPoints [i]->ZeEvent ;
893- WaitEventList[i] = ZeEvent;
894- ZE2UR_CALL (zeCommandListAppendEventReset,
895- (WaitCommandList->first , ZeEvent));
896- }
897-
898841 bool MustSignalWaitEvent = true ;
899842 if (NumEventsInWaitList) {
900843 _ur_ze_event_list_t TmpWaitList;
@@ -909,18 +852,31 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
909852 CommandBuffer->WaitEvent ->WaitList .insert (TmpWaitList);
910853
911854 if (!CommandBuffer->WaitEvent ->WaitList .isEmpty ()) {
855+ // Create a command-list that executes before `CommandListPtr` and signals
856+ // when the `EventWaitList` dependencies are complete.
857+ ur_command_list_ptr_t WaitCommandList{};
858+ UR_CALL (Queue->Context ->getAvailableCommandList (Queue, WaitCommandList,
859+ false , false ));
860+
912861 ZE2UR_CALL (zeCommandListAppendBarrier,
913862 (WaitCommandList->first , CommandBuffer->WaitEvent ->ZeEvent ,
914863 CommandBuffer->WaitEvent ->WaitList .Length ,
915864 CommandBuffer->WaitEvent ->WaitList .ZeEventList ));
865+ Queue->executeCommandList (WaitCommandList, false , false );
916866 MustSignalWaitEvent = false ;
917867 }
918868 }
919869 if (MustSignalWaitEvent) {
920- ZE2UR_CALL (zeCommandListAppendSignalEvent,
921- (WaitCommandList->first , CommandBuffer->WaitEvent ->ZeEvent ));
870+ ZE2UR_CALL (zeEventHostSignal, (CommandBuffer->WaitEvent ->ZeEvent ));
922871 }
923- Queue->executeCommandList (WaitCommandList, false , false );
872+
873+ // Submit reset events command-list. This command-list is of a batch
874+ // command-list type, regardless of the UR Queue type. We therefore need to
875+ // submit the list directly using the Level-Zero API to avoid type mismatches
876+ // if using UR functions.
877+ ZE2UR_CALL (
878+ zeCommandQueueExecuteCommandLists,
879+ (ZeCommandQueue, 1 , &CommandBuffer->ZeCommandListResetEvents , nullptr ));
924880
925881 // Submit main command-list. This command-list is of a batch command-list
926882 // type, regardless of the UR Queue type. We therefore need to submit the list
@@ -940,6 +896,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
940896 // submission dependencies have been satisfied.
941897 ZE2UR_CALL (zeCommandListAppendEventReset,
942898 (SignalCommandList->first , CommandBuffer->WaitEvent ->ZeEvent ));
899+ // Reset the all-reset-event for the UR command-buffer that is signaled when
900+ // all events of the main command-list have been reset.
901+ ZE2UR_CALL (zeCommandListAppendEventReset,
902+ (SignalCommandList->first , CommandBuffer->AllResetEvent ->ZeEvent ));
943903
944904 if (Event) {
945905 UR_CALL (createEventAndAssociateQueue (
@@ -955,14 +915,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
955915 // engine to recover these timestamps.
956916 command_buffer_profiling_t *Profiling = new command_buffer_profiling_t ();
957917
958- Profiling->NumEvents = WaitEventList .size ();
918+ Profiling->NumEvents = CommandBuffer-> ZeEventsList .size ();
959919 Profiling->Timestamps =
960920 new ze_kernel_timestamp_result_t [Profiling->NumEvents ];
961921
962922 ZE2UR_CALL (zeCommandListAppendQueryKernelTimestamps,
963- (SignalCommandList->first , WaitEventList .size (),
964- WaitEventList .data (), ( void *)Profiling-> Timestamps , 0 ,
965- RetEvent->ZeEvent , 1 ,
923+ (SignalCommandList->first , CommandBuffer-> ZeEventsList .size (),
924+ CommandBuffer-> ZeEventsList .data (),
925+ ( void *)Profiling-> Timestamps , 0 , RetEvent->ZeEvent , 1 ,
966926 &(CommandBuffer->SignalEvent ->ZeEvent )));
967927
968928 RetEvent->CommandData = static_cast <void *>(Profiling);
0 commit comments