@@ -58,16 +58,15 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_(
58
58
ze_command_list_handle_t CommandList,
59
59
ze_command_list_handle_t CommandListTranslated,
60
60
ze_command_list_handle_t CommandListResetEvents,
61
- ze_command_list_handle_t CopyCommandList,
62
- ZeStruct<ze_command_list_desc_t > ZeDesc,
63
- ZeStruct<ze_command_list_desc_t > ZeCopyDesc,
61
+ ze_command_list_handle_t CopyCommandList, ur_event_handle_t SignalEvent,
62
+ ur_event_handle_t WaitEvent, ur_event_handle_t AllResetEvent,
64
63
const ur_exp_command_buffer_desc_t *Desc, const bool IsInOrderCmdList)
65
64
: Context(Context), Device(Device), ZeComputeCommandList(CommandList),
66
65
ZeComputeCommandListTranslated(CommandListTranslated),
67
66
ZeCommandListResetEvents(CommandListResetEvents),
68
- ZeCommandListDesc(ZeDesc ), ZeCopyCommandList(CopyCommandList ),
69
- ZeCopyCommandListDesc(ZeCopyDesc ), ZeFencesMap( ), ZeActiveFence( nullptr ),
70
- SyncPoints(), NextSyncPoint(0 ),
67
+ ZeCopyCommandList(CopyCommandList ), SignalEvent(SignalEvent ),
68
+ WaitEvent(WaitEvent ), AllResetEvent(AllResetEvent ), ZeFencesMap( ),
69
+ ZeActiveFence( nullptr ), SyncPoints(), NextSyncPoint(0 ),
71
70
IsUpdatable(Desc ? Desc->isUpdatable : false ),
72
71
IsProfilingEnabled(Desc ? Desc->enableProfiling : false ),
73
72
IsInOrderCmdList(IsInOrderCmdList) {
@@ -540,30 +539,18 @@ static ur_result_t enqueueCommandBufferFillHelper(
540
539
return UR_RESULT_SUCCESS;
541
540
}
542
541
543
- UR_APIEXPORT ur_result_t UR_APICALL
544
- urCommandBufferCreateExp (ur_context_handle_t Context, ur_device_handle_t Device,
545
- const ur_exp_command_buffer_desc_t *CommandBufferDesc,
546
- ur_exp_command_buffer_handle_t *CommandBuffer) {
547
- // In-order command-lists are not available in old driver version.
548
- bool CompatibleDriver = IsDriverVersionNewerOrSimilar (Context, 1 , 3 , 28454 );
549
- const bool IsInOrder =
550
- CompatibleDriver
551
- ? (CommandBufferDesc ? CommandBufferDesc->isInOrder : false )
552
- : false ;
542
+ static ur_result_t
543
+ createMainCommandList (ur_context_handle_t Context, ur_device_handle_t Device,
544
+ bool IsInOrder, bool isUpdatable, bool isCopy,
545
+ ze_command_list_handle_t &CommandList) {
553
546
554
- uint32_t QueueGroupOrdinal =
555
- Device-> QueueGroup [ ur_device_handle_t_::queue_group_info_t ::type::Compute]
556
- .ZeOrdinal ;
547
+ auto type = isCopy ? ur_device_handle_t_:: queue_group_info_t ::type::MainCopy
548
+ : ur_device_handle_t_::queue_group_info_t ::type::Compute;
549
+ uint32_t QueueGroupOrdinal = Device-> QueueGroup [type] .ZeOrdinal ;
557
550
558
551
ZeStruct<ze_command_list_desc_t > ZeCommandListDesc;
559
552
ZeCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinal;
560
553
561
- ze_command_list_handle_t ZeCommandListResetEvents;
562
- // Create a command-list for reseting the events associated to enqueued cmd.
563
- ZE2UR_CALL (zeCommandListCreate,
564
- (Context->ZeContext , Device->ZeDevice , &ZeCommandListDesc,
565
- &ZeCommandListResetEvents));
566
-
567
554
// For non-linear graph, dependencies between commands are explicitly enforced
568
555
// by sync points when enqueuing. Consequently, relax the command ordering in
569
556
// the command list can enable the backend to further optimize the workload
@@ -573,41 +560,77 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
573
560
DEBUG_LOG (ZeCommandListDesc.flags );
574
561
575
562
ZeStruct<ze_mutable_command_list_exp_desc_t > ZeMutableCommandListDesc;
576
- if (CommandBufferDesc && CommandBufferDesc-> isUpdatable ) {
563
+ if (isUpdatable) {
577
564
ZeMutableCommandListDesc.flags = 0 ;
578
565
ZeCommandListDesc.pNext = &ZeMutableCommandListDesc;
579
566
}
580
567
581
- ze_command_list_handle_t ZeComputeCommandList;
582
- // TODO We could optimize this by pooling both Level Zero command-lists and UR
583
- // command-buffers, then reusing them.
584
568
ZE2UR_CALL (zeCommandListCreate, (Context->ZeContext , Device->ZeDevice ,
585
- &ZeCommandListDesc, &ZeComputeCommandList ));
569
+ &ZeCommandListDesc, &CommandList ));
586
570
587
- // Create a list for copy commands.
588
- // Note that to simplify the implementation, the current implementation only
589
- // uses the main copy engine and does not use the link engine even if
590
- // available.
571
+ return UR_RESULT_SUCCESS;
572
+ }
573
+
574
+ static ur_result_t
575
+ appendPreconditionEvents (ze_command_list_handle_t CommandList,
576
+ ur_event_handle_t WaitEvent,
577
+ ur_event_handle_t AllResetEvent) {
578
+ std::vector<ze_event_handle_t > PrecondEvents = {WaitEvent->ZeEvent ,
579
+ AllResetEvent->ZeEvent };
580
+ ZE2UR_CALL (
581
+ zeCommandListAppendBarrier,
582
+ (CommandList, nullptr , PrecondEvents.size (), PrecondEvents.data ()));
583
+ }
584
+
585
+ static bool
586
+ enableInOrder (ur_context_handle_t Context,
587
+ const ur_exp_command_buffer_desc_t *CommandBufferDesc) {
588
+ // In-order command-lists are not available in old driver version.
589
+ bool CompatibleDriver = IsDriverVersionNewerOrSimilar (Context, 1 , 3 , 28454 );
590
+ return CompatibleDriver
591
+ ? (CommandBufferDesc ? CommandBufferDesc->isInOrder : false )
592
+ : false ;
593
+ }
594
+ UR_APIEXPORT ur_result_t UR_APICALL
595
+ urCommandBufferCreateExp (ur_context_handle_t Context, ur_device_handle_t Device,
596
+ const ur_exp_command_buffer_desc_t *CommandBufferDesc,
597
+ ur_exp_command_buffer_handle_t *CommandBuffer) {
598
+
599
+ ur_event_handle_t SignalEvent;
600
+ ur_event_handle_t WaitEvent;
601
+ ur_event_handle_t AllResetEvent;
602
+
603
+ UR_CALL (EventCreate (Context, nullptr , false , false , &SignalEvent, false ,
604
+ !CommandBufferDesc->enableProfiling ));
605
+ UR_CALL (EventCreate (Context, nullptr , false , false , &WaitEvent, false ,
606
+ !CommandBufferDesc->enableProfiling ));
607
+ UR_CALL (EventCreate (Context, nullptr , false , false , &AllResetEvent, false ,
608
+ !CommandBufferDesc->enableProfiling ));
609
+
610
+ bool IsInOrder = enableInOrder (Context, CommandBufferDesc);
611
+ bool IsUpdatable = CommandBufferDesc && CommandBufferDesc->isUpdatable ;
612
+
613
+ ze_command_list_handle_t ZeComputeCommandList = nullptr ;
614
+ UR_CALL (createMainCommandList (Context, Device, IsInOrder, IsUpdatable, false ,
615
+ ZeComputeCommandList));
616
+ UR_CALL (
617
+ appendPreconditionEvents (ZeComputeCommandList, WaitEvent, AllResetEvent));
618
+
619
+ ze_command_list_handle_t ZeCommandListResetEvents = nullptr ;
620
+ UR_CALL (createMainCommandList (Context, Device, false , false , false ,
621
+ ZeCommandListResetEvents));
622
+ ZE2UR_CALL (zeCommandListAppendEventReset,
623
+ (ZeCommandListResetEvents, SignalEvent->ZeEvent ));
624
+
625
+ // Create a list for copy commands. Note that to simplify the implementation,
626
+ // the current implementation only uses the main copy engine and does not use
627
+ // the link engine even if available.
591
628
ze_command_list_handle_t ZeCopyCommandList = nullptr ;
592
- ZeStruct<ze_command_list_desc_t > ZeCopyCommandListDesc;
593
629
if (Device->hasMainCopyEngine ()) {
594
- uint32_t QueueGroupOrdinalCopy =
595
- Device
596
- ->QueueGroup
597
- [ur_device_handle_t_::queue_group_info_t ::type::MainCopy]
598
- .ZeOrdinal ;
599
-
600
- ZeCopyCommandListDesc.commandQueueGroupOrdinal = QueueGroupOrdinalCopy;
601
- // Dependencies between commands are explicitly enforced by sync points when
602
- // enqueuing. Consequently, relax the command ordering in the command list
603
- // can enable the backend to further optimize the workload
604
- ZeCopyCommandListDesc.flags = ZE_COMMAND_LIST_FLAG_RELAXED_ORDERING;
605
-
606
- // TODO We could optimize this by pooling both Level Zero command-lists and
607
- // UR command-buffers, then reusing them.
608
- ZE2UR_CALL (zeCommandListCreate,
609
- (Context->ZeContext , Device->ZeDevice , &ZeCopyCommandListDesc,
610
- &ZeCopyCommandList));
630
+ UR_CALL (createMainCommandList (Context, Device, false , false , true ,
631
+ ZeCopyCommandList));
632
+ UR_CALL (
633
+ appendPreconditionEvents (ZeCopyCommandList, WaitEvent, AllResetEvent));
611
634
}
612
635
613
636
ze_command_list_handle_t ZeComputeCommandListTranslated = nullptr ;
@@ -618,46 +641,14 @@ urCommandBufferCreateExp(ur_context_handle_t Context, ur_device_handle_t Device,
618
641
try {
619
642
*CommandBuffer = new ur_exp_command_buffer_handle_t_ (
620
643
Context, Device, ZeComputeCommandList, ZeComputeCommandListTranslated,
621
- ZeCommandListResetEvents, ZeCopyCommandList, ZeCommandListDesc ,
622
- ZeCopyCommandListDesc , CommandBufferDesc, IsInOrder);
644
+ ZeCommandListResetEvents, ZeCopyCommandList, SignalEvent, WaitEvent ,
645
+ AllResetEvent , CommandBufferDesc, IsInOrder);
623
646
} catch (const std::bad_alloc &) {
624
647
return UR_RESULT_ERROR_OUT_OF_HOST_MEMORY;
625
648
} catch (...) {
626
649
return UR_RESULT_ERROR_UNKNOWN;
627
650
}
628
651
629
- // Create signal & wait events to be used in the command-list for sync
630
- // on command-buffer enqueue.
631
- auto RetCommandBuffer = *CommandBuffer;
632
- UR_CALL (EventCreate (Context, nullptr , false , false ,
633
- &RetCommandBuffer->SignalEvent , false ,
634
- !RetCommandBuffer->IsProfilingEnabled ));
635
- UR_CALL (EventCreate (Context, nullptr , false , false ,
636
- &RetCommandBuffer->WaitEvent , false ,
637
- !RetCommandBuffer->IsProfilingEnabled ));
638
- UR_CALL (EventCreate (Context, nullptr , false , false ,
639
- &RetCommandBuffer->AllResetEvent , false ,
640
- !RetCommandBuffer->IsProfilingEnabled ));
641
-
642
- // Add prefix commands
643
- ZE2UR_CALL (
644
- zeCommandListAppendEventReset,
645
- (ZeCommandListResetEvents, RetCommandBuffer->SignalEvent ->ZeEvent ));
646
- std::vector<ze_event_handle_t > PrecondEvents = {
647
- RetCommandBuffer->WaitEvent ->ZeEvent ,
648
- RetCommandBuffer->AllResetEvent ->ZeEvent };
649
- ZE2UR_CALL (zeCommandListAppendBarrier,
650
- (ZeComputeCommandList, nullptr , PrecondEvents.size (),
651
- PrecondEvents.data ()));
652
-
653
- if (Device->hasMainCopyEngine ()) {
654
- // The copy command-list must be executed once the preconditions have been
655
- // met. We therefore begin this command-list with a barrier on the
656
- // preconditions.
657
- ZE2UR_CALL (zeCommandListAppendBarrier,
658
- (ZeCopyCommandList, nullptr , PrecondEvents.size (),
659
- PrecondEvents.data ()));
660
- }
661
652
return UR_RESULT_SUCCESS;
662
653
}
663
654
@@ -1164,6 +1155,7 @@ ur_result_t ur_exp_command_buffer_handle_t_::getFence(
1164
1155
ZeFence = ZeWorkloadFenceForQueue->second ;
1165
1156
ZE2UR_CALL (zeFenceReset, (ZeFence));
1166
1157
}
1158
+ this ->ZeActiveFence = ZeFence;
1167
1159
return UR_RESULT_SUCCESS;
1168
1160
}
1169
1161
@@ -1273,7 +1265,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
1273
1265
auto Queue = Legacy (UrQueue);
1274
1266
std::scoped_lock<ur_shared_mutex> lock (Queue->Mutex );
1275
1267
1276
- const auto UseCopyEngine = false ;
1277
1268
ze_command_queue_handle_t ZeCommandQueue;
1278
1269
CommandBuffer->getZeCommandQueue (Queue, false , ZeCommandQueue);
1279
1270
@@ -1309,10 +1300,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urCommandBufferEnqueueExp(
1309
1300
(ZeCopyCommandQueue, 1 , &CommandBuffer->ZeCopyCommandList , nullptr ));
1310
1301
}
1311
1302
1312
- // Execution event for this enqueue of the UR command-buffer
1313
- ur_event_handle_t RetEvent{};
1314
-
1315
- // Create a command-list to signal RetEvent on completion
1303
+ // Create a command-list to signal the Event on completion
1316
1304
ur_command_list_ptr_t SignalCommandList{};
1317
1305
UR_CALL (Queue->Context ->getAvailableCommandList (Queue, SignalCommandList,
1318
1306
false , NumEventsInWaitList,
0 commit comments