From 1e10ac6b33ce0951bd47fe045c47294e3560bf9e Mon Sep 17 00:00:00 2001
From: Agata Momot
Date: Mon, 7 Jul 2025 12:49:52 +0000
Subject: [PATCH 1/3] add support for batched queues in L0v2 adapter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Batched queues enable submitting operations to the driver in batches,
thereby reducing the overhead of submitting every single operation
individually.

Similarly to command buffers in L0v2, batched queues use regular command
lists (referred to below as 'batches'): operations enqueued on a regular
command list are not executed immediately, but only after the regular
command list is enqueued on an immediate command list. In contrast to
command buffers, however, batched queues do not merely collect enqueued
operations; they also handle the submission of batches (regular command
lists) themselves, using an internal immediate command list.

Batched queues introduce:
- batch_manager, which stores the current batch, a command list manager
  with an immediate command list for batch submissions, the vector of
  submitted batches, and the generation number of the current batch.
- The current batch is a command list manager with a regular command
  list; operations requested by users are enqueued on the current batch.
  The current batch may be submitted for execution on the immediate
  command list, replaced by a new regular command list, and stored in
  the vector of submitted batches until its execution completes.
- The number of regular command lists stored for execution is limited.
- The generation number of the current batch is assigned to events
  associated with operations enqueued on that batch, and is incremented
  on every replacement of the current batch. When an event created by a
  batched queue appears in an eventWaitList, the batch assigned to that
  event might not have been executed yet, so the event might never be
  signalled. Comparing generation numbers determines whether the current
  batch needs to be submitted for execution: if the generation number of
  the current batch is higher than the number assigned to the event, the
  batch associated with the event has already been submitted, and no
  additional submission of the current batch is needed (see the
  illustrative sketch below).
- Regular command lists use the regular pool cache type, whereas
  immediate command lists use the immediate pool cache type. Since
  user-requested operations are enqueued on regular command lists and
  immediate command lists are only used internally by the batched queue
  implementation, events are not created for immediate command lists.
- wait_list_view is modified. Previously, it only stored the waitlist
  (as a ze_event_handle_t buffer created from events) and the
  corresponding event count in a single container, which could be passed
  as an argument to the driver API. Now the constructor also ensures
  that all associated operations will eventually be executed. Since
  regular command lists are not executed immediately, but only after
  being enqueued on immediate lists, the regular command list associated
  with a given event must itself be enqueued; otherwise, the event would
  never be signalled.

Additionally, support for UR_QUEUE_INFO_FLAGS in urQueueGetInfo has been
added for native CPU, which is required by the enqueueTimestampRecording
tests. Currently, enqueueTimestampRecording is not supported by batched
queues.

Batched queues can be enabled per queue by setting
UR_QUEUE_FLAG_SUBMISSION_BATCHED in ur_queue_flags_t, or globally through
the environment variable UR_L0_FORCE_BATCHED=1.
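
For illustration only, a minimal C++ sketch of the generation-number check
described above. The names (batch_manager_sketch, currentGeneration,
needsSubmission, submitCurrentBatch) are hypothetical and simplified, not
the identifiers used by this patch; see queue_batched.cpp/.hpp for the
actual implementation.

    // Sketch (hypothetical names): deciding whether the current batch must
    // be submitted when an event created by this batched queue appears in
    // an eventWaitList.
    #include <cstdint>
    #include <iostream>

    struct batch_manager_sketch {
      // Generation number of the batch currently accepting new operations.
      // Incremented every time the current batch is replaced.
      uint64_t currentGeneration = 0;

      // True if the batch that will signal 'eventGeneration' has not been
      // submitted yet, so the current batch must be flushed first.
      bool needsSubmission(uint64_t eventGeneration) const {
        // If the current generation is already higher, the batch associated
        // with the event was replaced earlier, i.e. it has already been
        // submitted for execution.
        return eventGeneration == currentGeneration;
      }

      // Submitting the current batch replaces it and bumps the generation.
      void submitCurrentBatch() { ++currentGeneration; }
    };

    int main() {
      batch_manager_sketch mgr;
      uint64_t evGen = mgr.currentGeneration; // event on the current batch
      std::cout << std::boolalpha << mgr.needsSubmission(evGen) << '\n'; // true
      mgr.submitCurrentBatch();
      std::cout << mgr.needsSubmission(evGen) << '\n'; // false: already submitted
      return 0;
    }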
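
As a usage sketch (not part of the patch): assuming the
UR_QUEUE_FLAG_SUBMISSION_BATCHED bit introduced by this series is passed
through ur_queue_properties_t in the usual way, a batched queue could be
requested roughly like this; hContext and hDevice are placeholder handles
obtained through normal UR platform/device enumeration.

    #include <ur_api.h>

    ur_result_t createBatchedQueue(ur_context_handle_t hContext,
                                   ur_device_handle_t hDevice,
                                   ur_queue_handle_t *phQueue) {
      ur_queue_properties_t props{};
      props.stype = UR_STRUCTURE_TYPE_QUEUE_PROPERTIES;
      props.pNext = nullptr;
      // Opt this queue into batched submission (flag added by this series).
      props.flags = UR_QUEUE_FLAG_SUBMISSION_BATCHED;
      return urQueueCreate(hContext, hDevice, &props, phQueue);
    }

    // Alternatively, batched queues can be forced globally for the L0v2
    // adapter:  UR_L0_FORCE_BATCHED=1 ./my_app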
Benchmark results for default in-order queues (sycl branch, commit hash: b76f12e554760c3fcfc55f1f815a76b0d8b208ad) and batched queues: api_overhead_benchmark_ur SubmitKernel in order: 20.839 μs api_overhead_benchmark_ur SubmitKernel batched: 12.183 μs --- .../scripts/templates/queue_api.hpp.mako | 3 +- .../source/adapters/level_zero/CMakeLists.txt | 2 + .../adapters/level_zero/v2/command_buffer.cpp | 85 +- .../level_zero/v2/command_list_manager.cpp | 364 +++--- .../level_zero/v2/command_list_manager.hpp | 186 ++- .../source/adapters/level_zero/v2/event.cpp | 20 + .../source/adapters/level_zero/v2/event.hpp | 8 + .../adapters/level_zero/v2/queue_api.hpp | 3 +- .../adapters/level_zero/v2/queue_batched.cpp | 1042 +++++++++++++++++ .../adapters/level_zero/v2/queue_batched.hpp | 439 +++++++ .../adapters/level_zero/v2/queue_create.cpp | 18 + .../level_zero/v2/queue_extensions.hpp | 22 + .../adapters/level_zero/v2/queue_handle.hpp | 4 +- .../v2/queue_immediate_in_order.cpp | 35 +- .../v2/queue_immediate_in_order.hpp | 199 +++- .../v2/queue_immediate_out_of_order.cpp | 23 +- .../v2/queue_immediate_out_of_order.hpp | 202 +++- .../source/adapters/native_cpu/queue.cpp | 15 + .../enqueue/urEnqueueTimestampRecording.cpp | 10 + .../test/conformance/queue/urQueueGetInfo.cpp | 2 - 20 files changed, 2244 insertions(+), 438 deletions(-) create mode 100644 unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp create mode 100644 unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp create mode 100644 unified-runtime/source/adapters/level_zero/v2/queue_extensions.hpp diff --git a/unified-runtime/scripts/templates/queue_api.hpp.mako b/unified-runtime/scripts/templates/queue_api.hpp.mako index 25f53d1b79180..731fb7d1bc692 100644 --- a/unified-runtime/scripts/templates/queue_api.hpp.mako +++ b/unified-runtime/scripts/templates/queue_api.hpp.mako @@ -25,8 +25,9 @@ from templates import helper as th #pragma once #include +#include "queue_extensions.hpp" -struct ur_queue_t_ { +struct ur_queue_t_ : ur_queue_extensions { virtual ~ur_queue_t_(); %for obj in th.get_queue_related_functions(specs, n, tags): diff --git a/unified-runtime/source/adapters/level_zero/CMakeLists.txt b/unified-runtime/source/adapters/level_zero/CMakeLists.txt index 54f303a7823c9..d49868a927292 100644 --- a/unified-runtime/source/adapters/level_zero/CMakeLists.txt +++ b/unified-runtime/source/adapters/level_zero/CMakeLists.txt @@ -171,6 +171,7 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/v2/memory.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/lockable.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_batched.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_out_of_order.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.hpp @@ -187,6 +188,7 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/v2/kernel.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/memory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_batched.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_create.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_out_of_order.cpp diff --git a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp index b4c2674bd3364..80e2dedb82e23 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp 
@@ -12,6 +12,7 @@ #include "../command_buffer_command.hpp" #include "../helpers/kernel_helpers.hpp" #include "../ur_interface_loader.hpp" +#include "command_list_manager.hpp" #include "logger/ur_logger.hpp" #include "queue_handle.hpp" @@ -323,9 +324,12 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( auto eventsWaitList = commandBuffer->getWaitListFromSyncPoints( syncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendKernelLaunch( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, 0, - nullptr, numSyncPointsInWaitList, eventsWaitList, + nullptr, waitListView, commandBuffer->createEventIfRequested(retSyncPoint))); return UR_RESULT_SUCCESS; @@ -348,8 +352,11 @@ ur_result_t urCommandBufferAppendUSMMemcpyExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendUSMMemcpy( - false, pDst, pSrc, size, numSyncPointsInWaitList, eventsWaitList, + false, pDst, pSrc, size, waitListView, hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; @@ -375,9 +382,12 @@ ur_result_t urCommandBufferAppendMemBufferCopyExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendMemBufferCopy( - hSrcMem, hDstMem, srcOffset, dstOffset, size, numSyncPointsInWaitList, - eventsWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); + hSrcMem, hDstMem, srcOffset, dstOffset, size, waitListView, + hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; } catch (...) { @@ -402,9 +412,12 @@ ur_result_t urCommandBufferAppendMemBufferWriteExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendMemBufferWrite( - hBuffer, false, offset, size, pSrc, numSyncPointsInWaitList, - eventsWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); + hBuffer, false, offset, size, pSrc, waitListView, + hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; } catch (...) { @@ -427,9 +440,12 @@ ur_result_t urCommandBufferAppendMemBufferReadExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendMemBufferRead( - hBuffer, false, offset, size, pDst, numSyncPointsInWaitList, - eventsWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); + hBuffer, false, offset, size, pDst, waitListView, + hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; } catch (...) 
{ @@ -456,10 +472,13 @@ ur_result_t urCommandBufferAppendMemBufferCopyRectExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendMemBufferCopyRect( hSrcMem, hDstMem, srcOrigin, dstOrigin, region, srcRowPitch, - srcSlicePitch, dstRowPitch, dstSlicePitch, numSyncPointsInWaitList, - eventsWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); + srcSlicePitch, dstRowPitch, dstSlicePitch, waitListView, + hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; } catch (...) { @@ -486,10 +505,12 @@ ur_result_t urCommandBufferAppendMemBufferWriteRectExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendMemBufferWriteRect( hBuffer, false, bufferOffset, hostOffset, region, bufferRowPitch, - bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, - numSyncPointsInWaitList, eventsWaitList, + bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, waitListView, hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; @@ -517,10 +538,12 @@ ur_result_t urCommandBufferAppendMemBufferReadRectExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendMemBufferReadRect( hBuffer, false, bufferOffset, hostOffset, region, bufferRowPitch, - bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, - numSyncPointsInWaitList, eventsWaitList, + bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, waitListView, hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; @@ -543,9 +566,12 @@ ur_result_t urCommandBufferAppendUSMFillExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendUSMFill( - pMemory, patternSize, pPattern, size, numSyncPointsInWaitList, - eventsWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); + pMemory, patternSize, pPattern, size, waitListView, + hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -567,9 +593,12 @@ ur_result_t urCommandBufferAppendMemBufferFillExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendMemBufferFill( - hBuffer, pPattern, patternSize, offset, size, numSyncPointsInWaitList, - eventsWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); + hBuffer, pPattern, patternSize, offset, size, waitListView, + hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; } catch (...) 
{ @@ -593,8 +622,11 @@ ur_result_t urCommandBufferAppendUSMPrefetchExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendUSMPrefetch( - pMemory, size, flags, numSyncPointsInWaitList, eventsWaitList, + pMemory, size, flags, waitListView, hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; @@ -617,8 +649,11 @@ ur_result_t urCommandBufferAppendUSMAdviseExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendUSMAdvise( - pMemory, size, advice, numSyncPointsInWaitList, eventsWaitList, + pMemory, size, advice, waitListView, hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; @@ -667,15 +702,19 @@ ur_result_t urCommandBufferAppendNativeCommandExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); - UR_CALL(commandListLocked->appendEventsWaitWithBarrier( - numSyncPointsInWaitList, eventsWaitList, nullptr)); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + + UR_CALL( + commandListLocked->appendEventsWaitWithBarrier(waitListView, nullptr)); // Call user-defined function immediately pfnNativeCommand(pData); + wait_list_view emptyWaitList = wait_list_view(nullptr, 0); // Barrier on all commands after user defined commands. UR_CALL(commandListLocked->appendEventsWaitWithBarrier( - 0, nullptr, hCommandBuffer->createEventIfRequested(pSyncPoint))); + emptyWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp index 142caaecc1a71..ec350e4ea98c7 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp @@ -17,20 +17,118 @@ #include "kernel.hpp" #include "memory.hpp" +thread_local std::vector waitList; + +/* +The wait_list_view is a wrapper for eventsWaitLists, which: + - enables passing a ze_event_handle_t buffer created from events as an +argument for the driver API; + - handles enqueueing operations associated with given events if these +operations have not already been set for execution. + +Previously, it only stored the waitlist and the corresponding event count in a +single container. Currently, the constructor also ensures that all associated +operations will eventually be executed, which is required for batched queues in +L0v2. + +Wait events might have been created in batched queues, which use regular +command lists (batches). Since regular command lists are not executed +immediately, but only after enqueueing on immediate lists, it is necessary to +enqueue the regular command list associated with the given event. Otherwise, the +event would never be signalled. The enqueueing is performed in onWaitListView(). + +In the case of batched queues, the function onWaitListView() is not called if +the current queue created the given event. The operation associated with the +given wait_list_view is added to the current batch of the queue. 
The entire +batch is then enqueued for execution, i.e., as part of queueFinish or +queueFlush. For the same queue, events from the given eventsWaitList are +enqueued before the associated operation is executed. +*/ +template +void getZeHandlesBuffer(const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents, + ur_queue_t_ *currentBatchedQueue) { + for (uint32_t i = 0; i < numWaitEvents; i++) { + // checking if the current queue has created the given event applies only + // to batched queues + if constexpr (HasBatchedQueue) { + if (currentBatchedQueue != phWaitEvents[i]->getQueue()) { + phWaitEvents[i]->onWaitListUse(); + } + } + waitList[i] = phWaitEvents[i]->getZeEvent(); + } +} + +void wait_list_view::init(uint32_t numWaitEvents) { + num = numWaitEvents; + max_size = num + 1; + + waitList.resize(max_size); +} + +void wait_list_view::setHandles(const ur_event_handle_t *phWaitEvents) { + // vector.data() does not guarantee the null being returned in case of an + // empty vector. + // Explicit handling nullptr prevents passing uninitialized buffer to the + // driver + handles = phWaitEvents == nullptr ? nullptr : waitList.data(); +} + +wait_list_view::wait_list_view(const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents) { + init(numWaitEvents); + getZeHandlesBuffer(phWaitEvents, numWaitEvents, nullptr); + setHandles(phWaitEvents); +} + +wait_list_view::wait_list_view(const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents, + ur_queue_t_ *currentBatchedQueue) { + + init(numWaitEvents); + getZeHandlesBuffer(phWaitEvents, numWaitEvents, currentBatchedQueue); + setHandles(phWaitEvents); +} + +// At most one additional event might be added after creating the given waitlist +void wait_list_view::addEvent(ur_event_handle_t Event) { + if (Event) { + if (handles) { + assert(num != max_size); + handles[num] = Event->getZeEvent(); + num++; + } else { + waitList.resize(0); + waitList.emplace_back(Event->getZeEvent()); + num++; + handles = waitList.data(); + } + } +} + ur_command_list_manager::ur_command_list_manager( ur_context_handle_t context, ur_device_handle_t device, v2::raii::command_list_unique_handle &&commandList) : hContext(context), hDevice(device), zeCommandList(std::move(commandList)) {} +v2::raii::command_list_unique_handle && +ur_command_list_manager::releaseCommandList() { + return std::move(zeCommandList); +} + +void ur_command_list_manager::replaceCommandList( + v2::raii::command_list_unique_handle &&cmdlist) { + zeCommandList = std::move(cmdlist); +} + ur_result_t ur_command_list_manager::appendGenericFillUnlocked( ur_mem_buffer_t *dst, size_t offset, size_t patternSize, - const void *pPattern, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, - ur_command_t commandType) { + const void *pPattern, size_t size, wait_list_view &waitListView, + ur_event_handle_t phEvent, ur_command_t commandType) { auto zeSignalEvent = getSignalEvent(phEvent, commandType); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto pDst = ur_cast(dst->getDevicePtr( hDevice.get(), ur_mem_buffer_t::device_access_mode_t::read_only, offset, @@ -63,11 +161,9 @@ ur_result_t ur_command_list_manager::appendGenericFillUnlocked( ur_result_t ur_command_list_manager::appendGenericCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, size_t srcOffset, - size_t dstOffset, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, - 
ur_command_t commandType) { + size_t dstOffset, size_t size, wait_list_view &waitListView, + ur_event_handle_t phEvent, ur_command_t commandType) { auto zeSignalEvent = getSignalEvent(phEvent, commandType); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto pSrc = ur_cast(src->getDevicePtr( hDevice.get(), ur_mem_buffer_t::device_access_mode_t::read_only, @@ -92,14 +188,12 @@ ur_result_t ur_command_list_manager::appendRegionCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, - size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, - ur_command_t commandType) { + size_t dstRowPitch, size_t dstSlicePitch, wait_list_view &waitListView, + ur_event_handle_t phEvent, ur_command_t commandType) { auto zeParams = ur2zeRegionParams(srcOrigin, dstOrigin, region, srcRowPitch, dstRowPitch, srcSlicePitch, dstSlicePitch); auto zeSignalEvent = getSignalEvent(phEvent, commandType); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto pSrc = ur_cast(src->getDevicePtr( hDevice.get(), ur_mem_buffer_t::device_access_mode_t::read_only, 0, @@ -121,22 +215,6 @@ ur_result_t ur_command_list_manager::appendRegionCopyUnlocked( return UR_RESULT_SUCCESS; } -wait_list_view ur_command_list_manager::getWaitListView( - const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents, - ur_event_handle_t additionalWaitEvent) { - - uint32_t totalNumWaitEvents = - numWaitEvents + (additionalWaitEvent != nullptr ? 1 : 0); - waitList.resize(totalNumWaitEvents); - for (uint32_t i = 0; i < numWaitEvents; i++) { - waitList[i] = phWaitEvents[i]->getZeEvent(); - } - if (additionalWaitEvent != nullptr) { - waitList[totalNumWaitEvents - 1] = additionalWaitEvent->getZeEvent(); - } - return {waitList.data(), static_cast(totalNumWaitEvents)}; -} - ze_event_handle_t ur_command_list_manager::getSignalEvent(ur_event_handle_t hUserEvent, ur_command_t commandType) { @@ -151,9 +229,8 @@ ur_command_list_manager::getSignalEvent(ur_event_handle_t hUserEvent, ur_result_t ur_command_list_manager::appendKernelLaunchUnlocked( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, - bool cooperative) { + const size_t *pLocalWorkSize, wait_list_view &waitListView, + ur_event_handle_t phEvent, bool cooperative) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); @@ -171,7 +248,6 @@ ur_result_t ur_command_list_manager::appendKernelLaunchUnlocked( pGlobalWorkSize, pLocalWorkSize)); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); UR_CALL(hKernel->prepareForSubmission( hContext.get(), hDevice.get(), pGlobalWorkOffset, workDim, WG[0], WG[1], @@ -203,8 +279,7 @@ ur_result_t ur_command_list_manager::appendKernelLaunch( const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_kernel_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + 
wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendKernelLaunch"); for (uint32_t propIndex = 0; propIndex < numPropsInLaunchPropList; @@ -212,10 +287,9 @@ ur_result_t ur_command_list_manager::appendKernelLaunch( if (launchPropList[propIndex].id == UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE && launchPropList[propIndex].value.cooperative) { - UR_CALL(appendKernelLaunchUnlocked(hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, - phEvent, true /* cooperative */)); + UR_CALL(appendKernelLaunchUnlocked( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + waitListView, phEvent, true /* cooperative */)); return UR_RESULT_SUCCESS; } if (launchPropList[propIndex].id != UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE && @@ -228,20 +302,18 @@ ur_result_t ur_command_list_manager::appendKernelLaunch( UR_CALL(appendKernelLaunchUnlocked( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, phEvent, false /* cooperative */)); + waitListView, phEvent, false /* cooperative */)); return UR_RESULT_SUCCESS; } ur_result_t ur_command_list_manager::appendUSMMemcpy( bool blocking, void *pDst, const void *pSrc, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMMemcpy"); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_MEMCPY); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); + auto [pWaitEvents, numWaitEvents, _] = waitListView; ZE2UR_CALL(zeCommandListAppendMemoryCopy, (zeCommandList.get(), pDst, pSrc, size, zeSignalEvent, @@ -256,8 +328,8 @@ ur_result_t ur_command_list_manager::appendUSMMemcpy( ur_result_t ur_command_list_manager::appendMemBufferFill( ur_mem_handle_t hMem, const void *pPattern, size_t patternSize, - size_t offset, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + size_t offset, size_t size, wait_list_view &waitListView, + ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferFill"); auto hBuffer = hMem->getBuffer(); @@ -266,26 +338,23 @@ ur_result_t ur_command_list_manager::appendMemBufferFill( std::scoped_lock lock(hBuffer->getMutex()); return appendGenericFillUnlocked(hBuffer, offset, patternSize, pPattern, size, - numEventsInWaitList, phEventWaitList, - phEvent, UR_COMMAND_MEM_BUFFER_FILL); + waitListView, phEvent, + UR_COMMAND_MEM_BUFFER_FILL); } ur_result_t ur_command_list_manager::appendUSMFill( void *pMem, size_t patternSize, const void *pPattern, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMFill"); ur_usm_handle_t dstHandle(hContext.get(), size, pMem); return appendGenericFillUnlocked(&dstHandle, 0, patternSize, pPattern, size, - numEventsInWaitList, phEventWaitList, - phEvent, UR_COMMAND_USM_FILL); + waitListView, phEvent, UR_COMMAND_USM_FILL); } ur_result_t ur_command_list_manager::appendUSMPrefetch( const void *pMem, size_t size, ur_usm_migration_flags_t flags, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + 
wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMPrefetch"); switch (flags) { @@ -301,8 +370,7 @@ ur_result_t ur_command_list_manager::appendUSMPrefetch( } auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); + auto [pWaitEvents, numWaitEvents, _] = waitListView; if (pWaitEvents) { ZE2UR_CALL(zeCommandListAppendWaitOnEvents, @@ -323,15 +391,13 @@ ur_result_t ur_command_list_manager::appendUSMPrefetch( ur_result_t ur_command_list_manager::appendUSMAdvise( const void *pMem, size_t size, ur_usm_advice_flags_t advice, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMAdvise"); auto zeAdvice = ur_cast(advice); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); + auto [pWaitEvents, numWaitEvents, _] = waitListView; if (pWaitEvents) { ZE2UR_CALL(zeCommandListAppendWaitOnEvents, @@ -350,8 +416,7 @@ ur_result_t ur_command_list_manager::appendUSMAdvise( ur_result_t ur_command_list_manager::appendMemBufferRead( ur_mem_handle_t hMem, bool blockingRead, size_t offset, size_t size, - void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + void *pDst, wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferRead"); auto hBuffer = hMem->getBuffer(); @@ -362,14 +427,13 @@ ur_result_t ur_command_list_manager::appendMemBufferRead( std::scoped_lock lock(hBuffer->getMutex()); return appendGenericCopyUnlocked(hBuffer, &dstHandle, blockingRead, offset, 0, - size, numEventsInWaitList, phEventWaitList, - phEvent, UR_COMMAND_MEM_BUFFER_READ); + size, waitListView, phEvent, + UR_COMMAND_MEM_BUFFER_READ); } ur_result_t ur_command_list_manager::appendMemBufferWrite( ur_mem_handle_t hMem, bool blockingWrite, size_t offset, size_t size, - const void *pSrc, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + const void *pSrc, wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferWrite"); auto hBuffer = hMem->getBuffer(); @@ -379,15 +443,15 @@ ur_result_t ur_command_list_manager::appendMemBufferWrite( std::scoped_lock lock(hBuffer->getMutex()); - return appendGenericCopyUnlocked( - &srcHandle, hBuffer, blockingWrite, 0, offset, size, numEventsInWaitList, - phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_WRITE); + return appendGenericCopyUnlocked(&srcHandle, hBuffer, blockingWrite, 0, + offset, size, waitListView, phEvent, + UR_COMMAND_MEM_BUFFER_WRITE); } ur_result_t ur_command_list_manager::appendMemBufferCopy( ur_mem_handle_t hSrc, ur_mem_handle_t hDst, size_t srcOffset, - size_t dstOffset, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + size_t dstOffset, size_t size, wait_list_view &waitListView, + ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferCopy"); auto hBufferSrc = hSrc->getBuffer(); @@ -402,8 +466,7 @@ ur_result_t ur_command_list_manager::appendMemBufferCopy( hBufferSrc->getMutex(), hBufferDst->getMutex()); 
return appendGenericCopyUnlocked(hBufferSrc, hBufferDst, false, srcOffset, - dstOffset, size, numEventsInWaitList, - phEventWaitList, phEvent, + dstOffset, size, waitListView, phEvent, UR_COMMAND_MEM_BUFFER_COPY); } @@ -411,8 +474,7 @@ ur_result_t ur_command_list_manager::appendMemBufferReadRect( ur_mem_handle_t hMem, bool blockingRead, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, - void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + void *pDst, wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferReadRect"); auto hBuffer = hMem->getBuffer(); @@ -423,16 +485,14 @@ ur_result_t ur_command_list_manager::appendMemBufferReadRect( return appendRegionCopyUnlocked( hBuffer, &dstHandle, blockingRead, bufferOrigin, hostOrigin, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, - numEventsInWaitList, phEventWaitList, phEvent, - UR_COMMAND_MEM_BUFFER_READ_RECT); + waitListView, phEvent, UR_COMMAND_MEM_BUFFER_READ_RECT); } ur_result_t ur_command_list_manager::appendMemBufferWriteRect( ur_mem_handle_t hMem, bool blockingWrite, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, - void *pSrc, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + void *pSrc, wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferWriteRect"); auto hBuffer = hMem->getBuffer(); @@ -443,16 +503,14 @@ ur_result_t ur_command_list_manager::appendMemBufferWriteRect( return appendRegionCopyUnlocked( &srcHandle, hBuffer, blockingWrite, hostOrigin, bufferOrigin, region, hostRowPitch, hostSlicePitch, bufferRowPitch, bufferSlicePitch, - numEventsInWaitList, phEventWaitList, phEvent, - UR_COMMAND_MEM_BUFFER_WRITE_RECT); + waitListView, phEvent, UR_COMMAND_MEM_BUFFER_WRITE_RECT); } ur_result_t ur_command_list_manager::appendMemBufferCopyRect( ur_mem_handle_t hSrc, ur_mem_handle_t hDst, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferCopyRect"); auto hBufferSrc = hSrc->getBuffer(); @@ -461,16 +519,16 @@ ur_result_t ur_command_list_manager::appendMemBufferCopyRect( std::scoped_lock lock( hBufferSrc->getMutex(), hBufferDst->getMutex()); - return appendRegionCopyUnlocked( - hBufferSrc, hBufferDst, false, srcOrigin, dstOrigin, region, srcRowPitch, - srcSlicePitch, dstRowPitch, dstSlicePitch, numEventsInWaitList, - phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_COPY_RECT); + return appendRegionCopyUnlocked(hBufferSrc, hBufferDst, false, srcOrigin, + dstOrigin, region, srcRowPitch, srcSlicePitch, + dstRowPitch, dstSlicePitch, waitListView, + phEvent, UR_COMMAND_MEM_BUFFER_COPY_RECT); } ur_result_t ur_command_list_manager::appendUSMMemcpy2D( bool blocking, void *pDst, size_t dstPitch, const void *pSrc, - size_t srcPitch, size_t width, size_t height, uint32_t numEventsInWaitList, - const 
ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + size_t srcPitch, size_t width, size_t height, wait_list_view &waitListView, + ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMMemcpy2D"); ur_rect_offset_t zeroOffset{0, 0, 0}; @@ -481,21 +539,19 @@ ur_result_t ur_command_list_manager::appendUSMMemcpy2D( return appendRegionCopyUnlocked(&srcHandle, &dstHandle, blocking, zeroOffset, zeroOffset, region, srcPitch, 0, dstPitch, 0, - numEventsInWaitList, phEventWaitList, phEvent, + waitListView, phEvent, UR_COMMAND_USM_MEMCPY_2D); } ur_result_t ur_command_list_manager::appendTimestampRecordingExp( - bool blocking, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + bool blocking, wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendTimestampRecordingExp"); if (!phEvent) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); + auto [pWaitEvents, numWaitEvents, _] = waitListView; phEvent->recordStartTimestamp(); @@ -515,14 +571,13 @@ ur_result_t ur_command_list_manager::appendTimestampRecordingExp( ur_result_t ur_command_list_manager::appendGenericCommandListsExp( uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, - ur_event_handle_t phEvent, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand, - ur_event_handle_t additionalWaitEvent) { + ur_event_handle_t phEvent, wait_list_view &waitListView, + ur_command_t callerCommand, ur_event_handle_t additionalWaitEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendGenericCommandListsExp"); auto zeSignalEvent = getSignalEvent(phEvent, callerCommand); - auto [pWaitEvents, numWaitEvents] = getWaitListView( - phEventWaitList, numEventsInWaitList, additionalWaitEvent); + waitListView.addEvent(additionalWaitEvent); + auto [pWaitEvents, numWaitEvents, _] = waitListView; ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, (getZeCommandList(), numCommandLists, phCommandLists, @@ -532,8 +587,8 @@ ur_result_t ur_command_list_manager::appendGenericCommandListsExp( } ur_result_t ur_command_list_manager::appendCommandBufferExp( - ur_exp_command_buffer_handle_t hCommandBuffer, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + ur_exp_command_buffer_handle_t hCommandBuffer, wait_list_view &waitListView, + ur_event_handle_t phEvent) { auto bufferCommandListLocked = hCommandBuffer->commandListManager.lock(); ze_command_list_handle_t commandBufferCommandList = @@ -549,9 +604,10 @@ ur_result_t ur_command_list_manager::appendCommandBufferExp( (executionEvent->getZeEvent(), UINT64_MAX)); } - UR_CALL(appendGenericCommandListsExp( - 1, &commandBufferCommandList, phEvent, numEventsInWaitList, - phEventWaitList, UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, executionEvent)); + UR_CALL(appendGenericCommandListsExp(1, &commandBufferCommandList, phEvent, + waitListView, + UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, + /* already synchronized */ nullptr)); UR_CALL(hCommandBuffer->registerExecutionEventUnlocked(phEvent)); return UR_RESULT_SUCCESS; @@ -560,14 +616,12 @@ ur_result_t ur_command_list_manager::appendCommandBufferExp( ur_result_t ur_command_list_manager::appendMemImageRead( ur_mem_handle_t hMem, bool blockingRead, ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pDst, - 
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemImageRead"); auto hImage = hMem->getImage(); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_IMAGE_READ); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto [zeImage, zeRegion] = hImage->getRWRegion(origin, region, rowPitch, slicePitch); @@ -586,14 +640,12 @@ ur_result_t ur_command_list_manager::appendMemImageRead( ur_result_t ur_command_list_manager::appendMemImageWrite( ur_mem_handle_t hMem, bool blockingWrite, ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pSrc, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemImageWrite"); auto hImage = hMem->getImage(); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_IMAGE_WRITE); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto [zeImage, zeRegion] = hImage->getRWRegion(origin, region, rowPitch, slicePitch); @@ -612,15 +664,13 @@ ur_result_t ur_command_list_manager::appendMemImageWrite( ur_result_t ur_command_list_manager::appendMemImageCopy( ur_mem_handle_t hSrc, ur_mem_handle_t hDst, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemImageWrite"); auto hImageSrc = hSrc->getImage(); auto hImageDst = hDst->getImage(); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_IMAGE_COPY); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto desc = ur_mem_image_t::getCopyRegions(*hImageSrc, *hImageDst, srcOrigin, dstOrigin, region); @@ -638,9 +688,8 @@ ur_result_t ur_command_list_manager::appendMemImageCopy( ur_result_t ur_command_list_manager::appendMemBufferMap( ur_mem_handle_t hMem, bool blockingMap, ur_map_flags_t mapFlags, - size_t offset, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, - void **ppRetMap) { + size_t offset, size_t size, wait_list_view &waitListView, + ur_event_handle_t phEvent, void **ppRetMap) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferMap"); auto hBuffer = hMem->getBuffer(); @@ -648,7 +697,6 @@ ur_result_t ur_command_list_manager::appendMemBufferMap( std::scoped_lock lock(hBuffer->getMutex()); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_BUFFER_MAP); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto pDst = ur_cast(hBuffer->mapHostPtr( mapFlags, offset, size, zeCommandList.get(), waitListView)); @@ -672,15 +720,15 @@ ur_result_t ur_command_list_manager::appendMemBufferMap( return UR_RESULT_SUCCESS; } -ur_result_t ur_command_list_manager::appendMemUnmap( - ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { +ur_result_t +ur_command_list_manager::appendMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, + wait_list_view &waitListView, + ur_event_handle_t phEvent) { 
TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemUnmap"); auto hBuffer = hMem->getBuffer(); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_UNMAP); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); // TODO: currently unmapHostPtr deallocates memory immediately, // since the memory might be used by the user, we need to make sure @@ -700,9 +748,7 @@ ur_result_t ur_command_list_manager::appendMemUnmap( ur_result_t ur_command_list_manager::appendUSMFill2D( void * /*pMem*/, size_t /*pitch*/, size_t /*patternSize*/, const void * /*pPattern*/, size_t /*width*/, size_t /*height*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t /*phEvent*/) { + wait_list_view & /* waitListView */, ur_event_handle_t /*phEvent*/) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -726,8 +772,8 @@ static void *getGlobalPointerFromModule(ze_module_handle_t hModule, ur_result_t ur_command_list_manager::appendDeviceGlobalVariableWrite( ur_program_handle_t hProgram, const char *name, bool blockingWrite, - size_t count, size_t offset, const void *pSrc, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + size_t count, size_t offset, const void *pSrc, wait_list_view &waitListView, + ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY( "ur_command_list_manager::appendDeviceGlobalVariableWrite"); @@ -740,14 +786,13 @@ ur_result_t ur_command_list_manager::appendDeviceGlobalVariableWrite( // Locking is done inside appendUSMMemcpy return appendUSMMemcpy(blockingWrite, ur_cast(globalVarPtr) + offset, - pSrc, count, numEventsInWaitList, phEventWaitList, - phEvent); + pSrc, count, waitListView, phEvent); } ur_result_t ur_command_list_manager::appendDeviceGlobalVariableRead( ur_program_handle_t hProgram, const char *name, bool blockingRead, - size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + size_t count, size_t offset, void *pDst, wait_list_view &waitListView, + ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY( "ur_command_list_manager::appendDeviceGlobalVariableRead"); @@ -761,32 +806,27 @@ ur_result_t ur_command_list_manager::appendDeviceGlobalVariableRead( // Locking is done inside appendUSMMemcpy return appendUSMMemcpy(blockingRead, pDst, ur_cast(globalVarPtr) + offset, count, - numEventsInWaitList, phEventWaitList, phEvent); + waitListView, phEvent); } ur_result_t ur_command_list_manager::appendReadHostPipe( ur_program_handle_t /*hProgram*/, const char * /*pipe_symbol*/, bool /*blocking*/, void * /*pDst*/, size_t /*size*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t /*phEvent*/) { + wait_list_view & /* waitListView */, ur_event_handle_t /*phEvent*/) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } ur_result_t ur_command_list_manager::appendWriteHostPipe( ur_program_handle_t /*hProgram*/, const char * /*pipe_symbol*/, bool /*blocking*/, void * /*pSrc*/, size_t /*size*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t /*phEvent*/) { + wait_list_view & /* waitListView */, ur_event_handle_t /*phEvent*/) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } ur_result_t ur_command_list_manager::appendUSMAllocHelper( ur_queue_t_ *Queue, ur_usm_pool_handle_t pPool, const size_t size, - const ur_exp_async_usm_alloc_properties_t *, uint32_t numEventsInWaitList, - const 
ur_event_handle_t *phEventWaitList, void **ppMem, - ur_event_handle_t phEvent, ur_usm_type_t type) { + const ur_exp_async_usm_alloc_properties_t *, wait_list_view &waitListView, + void **ppMem, ur_event_handle_t phEvent, ur_usm_type_t type) { if (!pPool) { pPool = hContext->getAsyncPool(); } @@ -806,8 +846,7 @@ ur_result_t ur_command_list_manager::appendUSMAllocHelper( std::tie(*ppMem, originAllocEvent) = *asyncAlloc; } - auto waitListView = - getWaitListView(phEventWaitList, numEventsInWaitList, originAllocEvent); + waitListView.addEvent(originAllocEvent); ur_command_t commandType = UR_COMMAND_FORCE_UINT32; switch (type) { @@ -825,7 +864,7 @@ ur_result_t ur_command_list_manager::appendUSMAllocHelper( } auto zeSignalEvent = getSignalEvent(phEvent, commandType); - auto [pWaitEvents, numWaitEvents] = waitListView; + auto [pWaitEvents, numWaitEvents, _] = waitListView; if (numWaitEvents > 0) { ZE2UR_CALL(zeCommandListAppendWaitOnEvents, @@ -844,14 +883,12 @@ ur_result_t ur_command_list_manager::appendUSMAllocHelper( ur_result_t ur_command_list_manager::appendUSMFreeExp( ur_queue_t_ *Queue, ur_usm_pool_handle_t, void *pMem, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMFreeExp"); assert(phEvent); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_ENQUEUE_USM_FREE_EXP); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); + auto [pWaitEvents, numWaitEvents, _] = waitListView; umf_memory_pool_handle_t hPool = nullptr; auto umfRet = umfPoolByPtr(pMem, &hPool); @@ -896,11 +933,9 @@ ur_result_t ur_command_list_manager::bindlessImagesImageCopyExp( ur_exp_image_copy_region_t *pCopyRegion, ur_exp_image_copy_flags_t imageCopyFlags, ur_exp_image_copy_input_types_t imageCopyInputTypes, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t phEvent) { auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_IMAGE_COPY); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); return bindlessImagesHandleCopyFlags( pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, @@ -911,8 +946,8 @@ ur_result_t ur_command_list_manager::bindlessImagesImageCopyExp( ur_result_t ur_command_list_manager::bindlessImagesWaitExternalSemaphoreExp( ur_exp_external_semaphore_handle_t hSemaphore, bool hasWaitValue, - uint64_t waitValue, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + uint64_t waitValue, wait_list_view &waitListView, + ur_event_handle_t phEvent) { auto hPlatform = hContext->getPlatform(); if (hPlatform->ZeExternalSemaphoreExt.Supported == false) { UR_LOG_LEGACY(ERR, @@ -923,8 +958,7 @@ ur_result_t ur_command_list_manager::bindlessImagesWaitExternalSemaphoreExp( auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_EXTERNAL_SEMAPHORE_WAIT_EXP); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); + auto [pWaitEvents, numWaitEvents, _] = waitListView; ze_external_semaphore_wait_params_ext_t waitParams = { ZE_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_WAIT_PARAMS_EXT, nullptr, 0}; @@ -941,8 +975,8 @@ ur_result_t ur_command_list_manager::bindlessImagesWaitExternalSemaphoreExp( ur_result_t ur_command_list_manager::bindlessImagesSignalExternalSemaphoreExp( 
ur_exp_external_semaphore_handle_t hSemaphore, bool hasSignalValue, - uint64_t signalValue, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + uint64_t signalValue, wait_list_view &waitListView, + ur_event_handle_t phEvent) { auto hPlatform = hContext->getPlatform(); if (hPlatform->ZeExternalSemaphoreExt.Supported == false) { UR_LOG_LEGACY(ERR, @@ -953,8 +987,7 @@ ur_result_t ur_command_list_manager::bindlessImagesSignalExternalSemaphoreExp( auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_EXTERNAL_SEMAPHORE_SIGNAL_EXP); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); + auto [pWaitEvents, numWaitEvents, _] = waitListView; ze_external_semaphore_signal_params_ext_t signalParams = { ZE_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_EXT, nullptr, 0}; @@ -973,7 +1006,7 @@ ur_result_t ur_command_list_manager::bindlessImagesSignalExternalSemaphoreExp( ur_result_t ur_command_list_manager::appendNativeCommandExp( ur_exp_enqueue_native_command_function_t, void *, uint32_t, const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, - uint32_t, const ur_event_handle_t *, ur_event_handle_t) { + wait_list_view &, ur_event_handle_t) { UR_LOG_LEGACY( ERR, logger::LegacyMessage("[UR][L0_v2] {} function not implemented!"), "{} function not implemented!", __FUNCTION__); @@ -991,14 +1024,13 @@ ze_command_list_handle_t ur_command_list_manager::getZeCommandList() { return zeCommandList.get(); } -ur_result_t ur_command_list_manager::appendEventsWait( - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { +ur_result_t +ur_command_list_manager::appendEventsWait(wait_list_view &waitListView, + ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendEventsWait"); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); + auto [pWaitEvents, numWaitEvents, _] = waitListView; if (numWaitEvents > 0) { ZE2UR_CALL(zeCommandListAppendWaitOnEvents, @@ -1014,14 +1046,12 @@ ur_result_t ur_command_list_manager::appendEventsWait( } ur_result_t ur_command_list_manager::appendEventsWaitWithBarrier( - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitList, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendEventsWaitWithBarrier"); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); + auto [pWaitEvents, numWaitEvents, _] = waitList; ZE2UR_CALL(zeCommandListAppendBarrier, (zeCommandList.get(), zeSignalEvent, numWaitEvents, pWaitEvents)); diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp index 3c1bbd710ed47..9461fd3ccbeb2 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp @@ -14,6 +14,7 @@ #include "context.hpp" #include "event_pool_cache.hpp" #include "queue_api.hpp" +#include "ur_api.h" #include struct ur_mem_buffer_t; @@ -21,9 +22,17 @@ struct ur_mem_buffer_t; struct wait_list_view { ze_event_handle_t *handles; uint32_t num; + uint32_t max_size; - wait_list_view(ze_event_handle_t 
*handles, uint32_t num) - : handles(num > 0 ? handles : nullptr), num(num) {} + wait_list_view(const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents); + wait_list_view(const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents, + ur_queue_t_ *currentBatchedQueue); + + void init(uint32_t numWaitEvents); + + void setHandles(const ur_event_handle_t *phWaitEvents); + + void addEvent(ur_event_handle_t Event); operator bool() const { assert((handles != nullptr) == (num > 0)); @@ -54,127 +63,110 @@ struct ur_command_list_manager { ur_result_t releaseSubmittedKernels(); /************ Generic queue methods *************/ - ur_result_t appendEventsWait(uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + ur_result_t appendEventsWait(wait_list_view &waitListView, ur_event_handle_t phEvent); - ur_result_t - appendEventsWaitWithBarrier(uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent); + ur_result_t appendEventsWaitWithBarrier(wait_list_view &waitList, + ur_event_handle_t phEvent); ur_result_t appendMemBufferRead(ur_mem_handle_t hBuffer, bool blockingRead, size_t offset, size_t size, void *pDst, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendMemBufferWrite(ur_mem_handle_t hBuffer, bool blockingWrite, size_t offset, size_t size, const void *pSrc, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendMemBufferReadRect( ur_mem_handle_t hBuffer, bool blockingRead, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, - size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); + size_t hostSlicePitch, void *pDst, wait_list_view &waitListView, + ur_event_handle_t phEvent); ur_result_t appendMemBufferWriteRect( ur_mem_handle_t hBuffer, bool blockingWrite, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent); + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendMemBufferCopy(ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, size_t srcOffset, size_t dstOffset, size_t size, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendMemBufferCopyRect( ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, - size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); + size_t dstRowPitch, size_t dstSlicePitch, wait_list_view &waitListView, + ur_event_handle_t phEvent); ur_result_t appendMemBufferFill(ur_mem_handle_t hBuffer, const void *pPattern, size_t patternSize, size_t offset, - size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + size_t size, wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendMemImageRead(ur_mem_handle_t 
hImage, bool blockingRead, ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pDst, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendMemImageWrite(ur_mem_handle_t hImage, bool blockingWrite, ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pSrc, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendMemImageCopy(ur_mem_handle_t hImageSrc, ur_mem_handle_t hImageDst, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, - ur_rect_region_t region, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + ur_rect_region_t region, wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendMemBufferMap(ur_mem_handle_t hBuffer, bool blockingMap, ur_map_flags_t mapFlags, size_t offset, - size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + size_t size, wait_list_view &waitListView, ur_event_handle_t phEvent, void **ppRetMap); ur_result_t appendMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendUSMFill(void *pMem, size_t patternSize, const void *pPattern, size_t size, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendUSMMemcpy(bool blocking, void *pDst, const void *pSrc, - size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + size_t size, wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendUSMFill2D(void *, size_t, size_t, const void *, size_t, - size_t, uint32_t, const ur_event_handle_t *, - ur_event_handle_t); + size_t, wait_list_view &, ur_event_handle_t); ur_result_t appendUSMMemcpy2D(bool, void *, size_t, const void *, size_t, - size_t, size_t, uint32_t, - const ur_event_handle_t *, ur_event_handle_t); + size_t, size_t, wait_list_view &, + ur_event_handle_t); ur_result_t appendUSMPrefetch(const void *pMem, size_t size, ur_usm_migration_flags_t flags, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendUSMAdvise(const void *pMem, size_t size, ur_usm_advice_flags_t advice, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); - ur_result_t appendDeviceGlobalVariableWrite( - ur_program_handle_t hProgram, const char *name, bool blockingWrite, - size_t count, size_t offset, const void *pSrc, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent); - ur_result_t appendDeviceGlobalVariableRead( - ur_program_handle_t hProgram, const char *name, bool blockingRead, - size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); + ur_result_t appendDeviceGlobalVariableWrite(ur_program_handle_t hProgram, + const char *name, + bool blockingWrite, size_t count, + size_t offset, const void *pSrc, + wait_list_view &waitListView, + ur_event_handle_t phEvent); + ur_result_t appendDeviceGlobalVariableRead(ur_program_handle_t hProgram, + const 
char *name, + bool blockingRead, size_t count, + size_t offset, void *pDst, + wait_list_view &waitListView, + ur_event_handle_t phEvent); ur_result_t appendReadHostPipe(ur_program_handle_t hProgram, const char *pipe_symbol, bool blocking, void *pDst, size_t size, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendWriteHostPipe(ur_program_handle_t hProgram, const char *pipe_symbol, bool blocking, void *pSrc, size_t size, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t bindlessImagesImageCopyExp( const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, @@ -184,96 +176,90 @@ struct ur_command_list_manager { ur_exp_image_copy_region_t *pCopyRegion, ur_exp_image_copy_flags_t imageCopyFlags, ur_exp_image_copy_input_types_t imageCopyInputTypes, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent); + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t bindlessImagesWaitExternalSemaphoreExp( ur_exp_external_semaphore_handle_t hSemaphore, bool hasWaitValue, - uint64_t waitValue, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); + uint64_t waitValue, wait_list_view &waitListView, + ur_event_handle_t phEvent); ur_result_t bindlessImagesSignalExternalSemaphoreExp( ur_exp_external_semaphore_handle_t hSemaphore, bool hasSignalValue, - uint64_t signalValue, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); + uint64_t signalValue, wait_list_view &waitListView, + ur_event_handle_t phEvent); ur_result_t appendCooperativeKernelLaunchExp( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); - ur_result_t - appendTimestampRecordingExp(bool blocking, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent); + ur_result_t appendTimestampRecordingExp(bool blocking, + wait_list_view &waitListView, + ur_event_handle_t phEvent); ur_result_t appendCommandBufferExp(ur_exp_command_buffer_handle_t hCommandBuffer, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendKernelLaunch( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_kernel_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent); + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendNativeCommandExp(ur_exp_enqueue_native_command_function_t, void *, uint32_t, const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, - uint32_t, const ur_event_handle_t *, - ur_event_handle_t); + wait_list_view &, ur_event_handle_t); ur_result_t appendUSMAllocHelper( ur_queue_t_ *Queue, ur_usm_pool_handle_t pPool, const size_t size, - const ur_exp_async_usm_alloc_properties_t *, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, void **ppMem, - ur_event_handle_t phEvent, ur_usm_type_t type); + 
const ur_exp_async_usm_alloc_properties_t *, wait_list_view &waitListView, + void **ppMem, ur_event_handle_t phEvent, ur_usm_type_t type); ur_result_t appendUSMFreeExp(ur_queue_t_ *Queue, ur_usm_pool_handle_t, - void *pMem, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + void *pMem, wait_list_view &waitListView, ur_event_handle_t phEvent); + v2::raii::command_list_unique_handle &&releaseCommandList(); + + void replaceCommandList(v2::raii::command_list_unique_handle &&cmdlist); + private: ur_result_t appendGenericCommandListsExp( uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, - ur_event_handle_t phEvent, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand, - ur_event_handle_t additionalWaitEvent); + ur_event_handle_t phEvent, wait_list_view &waitListView, + ur_command_t callerCommand, ur_event_handle_t additionalWaitEvent); void recordSubmittedKernel(ur_kernel_handle_t hKernel); - wait_list_view - getWaitListView(const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents, - ur_event_handle_t additionalWaitEvent = nullptr); ze_event_handle_t getSignalEvent(ur_event_handle_t hUserEvent, ur_command_t commandType); ur_result_t appendKernelLaunchUnlocked( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, - bool cooperative); + const size_t *pLocalWorkSize, wait_list_view &waitListView, + ur_event_handle_t phEvent, bool cooperative); - ur_result_t appendGenericFillUnlocked( - ur_mem_buffer_t *hBuffer, size_t offset, size_t patternSize, - const void *pPattern, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, - ur_command_t commandType); + ur_result_t appendGenericFillUnlocked(ur_mem_buffer_t *hBuffer, size_t offset, + size_t patternSize, + const void *pPattern, size_t size, + wait_list_view &waitListView, + ur_event_handle_t phEvent, + ur_command_t commandType); - ur_result_t appendGenericCopyUnlocked( - ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, - size_t srcOffset, size_t dstOffset, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent, ur_command_t commandType); + ur_result_t appendGenericCopyUnlocked(ur_mem_buffer_t *src, + ur_mem_buffer_t *dst, bool blocking, + size_t srcOffset, size_t dstOffset, + size_t size, + wait_list_view &waitListView, + ur_event_handle_t phEvent, + ur_command_t commandType); ur_result_t appendRegionCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, - size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, - ur_command_t commandType); + size_t dstRowPitch, size_t dstSlicePitch, wait_list_view &waitListView, + ur_event_handle_t phEvent, ur_command_t commandType); // Context needs to be a first member - it needs to be alive // until all other members are destroyed. 
diff --git a/unified-runtime/source/adapters/level_zero/v2/event.cpp b/unified-runtime/source/adapters/level_zero/v2/event.cpp
index 2c3c4b9a8685c..5f3f693db0811 100644
--- a/unified-runtime/source/adapters/level_zero/v2/event.cpp
+++ b/unified-runtime/source/adapters/level_zero/v2/event.cpp
@@ -8,6 +8,7 @@
 //
 //===----------------------------------------------------------------------===//

+#include <optional>
 #include

 #include "context.hpp"
@@ -123,10 +124,22 @@ void ur_event_handle_t_::setQueue(ur_queue_t_ *hQueue) {
   profilingData.reset();
 }

+void ur_event_handle_t_::setBatch(ur_event_generation_t batch_generation) {
+  this->batchGeneration = batch_generation;
+}
+
 void ur_event_handle_t_::setCommandType(ur_command_t commandType) {
   this->commandType = commandType;
 }

+// Enqueue batch execution if the event is created by the batched queue as part
+// of its current batch
+void ur_event_handle_t_::onWaitListUse() {
+  if (batchGeneration) {
+    hQueue->onEventWaitListUse(batchGeneration.value());
+  }
+}
+
 void ur_event_handle_t_::recordStartTimestamp() {
   // queue and device must be set before calling this
   assert(hQueue);
@@ -149,6 +162,8 @@ void ur_event_handle_t_::reset() {
   if (!(flags & v2::EVENT_FLAGS_COUNTER)) {
     zeEventHostReset(getZeEvent());
   }
+
+  batchGeneration = std::nullopt;
 }

 ze_event_handle_t ur_event_handle_t_::getZeEvent() const {
@@ -192,6 +207,10 @@ ur_event_handle_t_::getEventEndTimestampAndHandle() {

 ur_queue_t_ *ur_event_handle_t_::getQueue() const { return hQueue; }

+std::optional<ur_event_generation_t> ur_event_handle_t_::getBatch() const {
+  return batchGeneration;
+}
+
 ur_context_handle_t ur_event_handle_t_::getContext() const { return hContext; }

 ur_command_t ur_event_handle_t_::getCommandType() const { return commandType; }
@@ -234,6 +253,7 @@ ur_result_t urEventRelease(ur_event_handle_t hEvent) try {
 ur_result_t urEventWait(uint32_t numEvents,
                         const ur_event_handle_t *phEventWaitList) try {
   for (uint32_t i = 0; i < numEvents; ++i) {
+    phEventWaitList[i]->onWaitListUse();
     ZE2UR_CALL(zeEventHostSynchronize,
                (phEventWaitList[i]->getZeEvent(), UINT64_MAX));
   }
diff --git a/unified-runtime/source/adapters/level_zero/v2/event.hpp b/unified-runtime/source/adapters/level_zero/v2/event.hpp
index 9a31c47358947..97f8052ec839f 100644
--- a/unified-runtime/source/adapters/level_zero/v2/event.hpp
+++ b/unified-runtime/source/adapters/level_zero/v2/event.hpp
@@ -20,6 +20,8 @@
 #include "common/ur_ref_count.hpp"
 #include "event_provider.hpp"

+using ur_event_generation_t = int64_t;
+
 namespace v2 {
 class event_pool;
 }
@@ -68,7 +70,9 @@ struct ur_event_handle_t_ : ur_object {
   // Set the queue and command that this event is associated with
   void setQueue(ur_queue_t_ *hQueue);
+  void setBatch(ur_event_generation_t batch_generation);
   void setCommandType(ur_command_t commandType);
+  void onWaitListUse();
   void reset();

   ze_event_handle_t getZeEvent() const;
@@ -98,6 +102,8 @@ struct ur_event_handle_t_ : ur_object {
   // Get the type of the command that this event is associated with
   ur_command_t getCommandType() const;

+  std::optional<ur_event_generation_t> getBatch() const;
+
   // Get the device associated with this event
   ur_device_handle_t getDevice() const;

@@ -129,6 +135,8 @@ struct ur_event_handle_t_ : ur_object {
   // queue and commandType that this event is associated with, set by enqueue
   // commands
   ur_queue_t_ *hQueue = nullptr;
+  // std::optional holds a value for events created by batched queues
+  std::optional<ur_event_generation_t> batchGeneration;
   ur_command_t commandType = UR_COMMAND_FORCE_UINT32;

   ur_device_handle_t hDevice = nullptr;
diff --git
a/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp index 4bd9d8fd2141e..5d730697e834c 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp @@ -16,9 +16,10 @@ #pragma once +#include "queue_extensions.hpp" #include -struct ur_queue_t_ { +struct ur_queue_t_ : ur_queue_extensions { virtual ~ur_queue_t_(); virtual ur_result_t queueGetInfo(ur_queue_info_t, size_t, void *, diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp new file mode 100644 index 0000000000000..1632c7c4b50bf --- /dev/null +++ b/unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp @@ -0,0 +1,1042 @@ +//===--------------- queue_batched.cpp - Level Zero Adapter ---------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "queue_batched.hpp" +#include "adapters/level_zero/common.hpp" +#include "command_buffer.hpp" +#include "command_list_cache.hpp" +#include "command_list_manager.hpp" +#include "event.hpp" +#include "event_pool.hpp" +#include "kernel.hpp" +#include "lockable.hpp" +#include "memory.hpp" +#include "ur.hpp" + +#include "../common/latency_tracker.hpp" +#include "../helpers/kernel_helpers.hpp" +#include "../image_common.hpp" + +#include "../program.hpp" +#include "../ur_interface_loader.hpp" +#include "ur_api.h" +#include "ze_api.h" +#include +#include +#include + +namespace v2 { + +ur_queue_batched_t::ur_queue_batched_t( + ur_context_handle_t hContext, ur_device_handle_t hDevice, uint32_t ordinal, + ze_command_queue_priority_t priority, std::optional index, + [[maybe_unused]] event_flags_t eventFlags, ur_queue_flags_t flags) + : regularCmdListDesc(v2::command_list_desc_t{ + true /* isInOrder*/, ordinal /* Ordinal*/, + true /* copyOffloadEnable*/, false /*isMutable*/}), + currentCmdLists( + hContext, hDevice, + /* regular command list*/ + hContext->getCommandListCache().getRegularCommandList( + hDevice->ZeDevice, regularCmdListDesc), + /* command list immediate*/ + hContext->getCommandListCache().getImmediateCommandList( + hDevice->ZeDevice, + {true, ordinal, true /* always enable copy offload */}, + ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, priority, index) + + ) { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::constructor"); + + // TODO common code? 
+ if (!hContext->getPlatform()->ZeCommandListImmediateAppendExt.Supported) { + UR_LOG(ERR, "Adapter v2 is used but the current driver does not support " + "the zeCommandListImmediateAppendCommandListsExp entrypoint."); + throw UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + this->hContext = hContext; + this->hDevice = hDevice; + + this->flags = flags; + + eventPoolRegular = hContext->getEventPoolCache(PoolCacheType::Regular) + .borrow(hDevice->Id.value(), v2::EVENT_FLAGS_COUNTER); +} + +ur_event_handle_t ur_queue_batched_t::createEventIfRequestedRegular( + ur_event_handle_t *phEvent, ur_event_generation_t batch_generation) { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::createEventIfRequested"); + + if (phEvent == nullptr) { + return nullptr; + } + + (*phEvent) = eventPoolRegular->allocate(); + (*phEvent)->setQueue(this); + (*phEvent)->setBatch(batch_generation); + + return (*phEvent); +} + +ur_event_handle_t ur_queue_batched_t::createEventAndRetainRegular( + ur_event_handle_t *phEvent, ur_event_generation_t batch_generation) { + auto hEvent = eventPoolRegular->allocate(); + hEvent->setQueue(this); + hEvent->setBatch(batch_generation); + + if (phEvent) { + (*phEvent) = hEvent; + hEvent->retain(); + } + + return hEvent; +} + +ur_result_t batch_manager::renewRegularUnlocked( + v2::raii::command_list_unique_handle &&newRegularBatch) { + TRACK_SCOPE_LATENCY("batch_manager::renewRegularUnlocked"); + + regularGenerationNumber++; + + // save the previous regular command list for execution + runBatches.push_back(activeBatch.releaseCommandList()); + // renew the regular command list (current batch) + activeBatch.replaceCommandList( + std::forward(newRegularBatch)); + + setBatchEmpty(); + + return UR_RESULT_SUCCESS; +} + +ur_result_t +ur_queue_batched_t::renewBatchUnlocked(locked &batchLocked) { + if (batchLocked->isLimitOfUsedCommandListsReached()) { + UR_CALL(queueFinishUnlocked(batchLocked)); + } + + return batchLocked->renewRegularUnlocked(getNewRegularCmdList()); +} + +ur_result_t batch_manager::enqueueCurrentBatchUnlocked() { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::enqueueCurrentBatchUnlocked"); + + ze_command_list_handle_t regularList = activeBatch.getZeCommandList(); + { + TRACK_SCOPE_LATENCY( + "ur_queue_batched_t::enqueueCurrentBatchUnlocked_finalize"); + // finalize + ZE2UR_CALL(zeCommandListClose, (regularList)); + } + { + TRACK_SCOPE_LATENCY( + "ur_queue_batched_t::enqueueCurrentBatchUnlocked_runBatchAppend"); + // run batch + ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, + (immediateList.getZeCommandList(), 1, ®ularList, nullptr, 0, + nullptr)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t +ur_queue_batched_t::onEventWaitListUse(ur_event_generation_t batch_generation) { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::onEventWaitListUse"); + + auto batchLocked = currentCmdLists.lock(); + if (batchLocked->isCurrentGeneration(batch_generation)) { + return queueFlushUnlocked(batchLocked); + } else { + return UR_RESULT_SUCCESS; + } +} + +ur_result_t ur_queue_batched_t::enqueueKernelLaunch( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + 
TRACK_SCOPE_LATENCY("ur_queue_batched_t::enqueueKernelLaunch"); + auto currentRegular = currentCmdLists.lock(); + + currentRegular->markIssuedCommand(); + + UR_CALL(currentRegular->getActiveBatch().appendKernelLaunch( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, waitListView, + createEventIfRequestedRegular(phEvent, + currentRegular->getCurrentGeneration()))); + + return UR_RESULT_SUCCESS; +} + +ur_result_t batch_manager::hostSynchronize() { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::hostSynchronize"); + + ZE2UR_CALL(zeCommandListHostSynchronize, + (immediateList.getZeCommandList(), UINT64_MAX)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::queueFinishPoolsUnlocked() { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::asyncPools"); + + hContext->getAsyncPool()->cleanupPoolsForQueue(this); + hContext->forEachUsmPool([this](ur_usm_pool_handle_t hPool) { + hPool->cleanupPoolsForQueue(this); + return true; + }); + + return UR_RESULT_SUCCESS; +} + +ur_result_t batch_manager::batchFinish() { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::batchFinish"); + + { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::releaseSubmittedKernels"); + UR_CALL(immediateList.releaseSubmittedKernels()); + } + + { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::resetRegCmdlist"); + ZE2UR_CALL(zeCommandListReset, (activeBatch.getZeCommandList())); + } + + runBatches.clear(); + + return UR_RESULT_SUCCESS; +} + +ur_result_t +ur_queue_batched_t::queueFinishUnlocked(locked &batchLocked) { + UR_CALL(batchLocked->enqueueCurrentBatchUnlocked()); + UR_CALL(batchLocked->hostSynchronize()); + + UR_CALL(queueFinishPoolsUnlocked()); + + return batchLocked->batchFinish(); +} + +ur_result_t ur_queue_batched_t::queueFinish() { + try { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::queueFinish"); + // finish current batch + auto lockedBatches = currentCmdLists.lock(); + return queueFinishUnlocked(lockedBatches); + + } catch (...) { + return exceptionToResult(std::current_exception()); + } +} + +ur_queue_batched_t::~ur_queue_batched_t() { + try { + UR_CALL_THROWS(queueFinish()); + } catch (...) { + // Ignore errors during destruction + } +} + +ur_result_t ur_queue_batched_t::enqueueMemBufferRead( + ur_mem_handle_t hBuffer, bool blockingRead, size_t offset, size_t size, + void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + try { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::enqueueMemBufferRead"); + + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatches = currentCmdLists.lock(); + + lockedBatches->markIssuedCommand(); + + UR_CALL(lockedBatches->getActiveBatch().appendMemBufferRead( + hBuffer, false, offset, size, pDst, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatches->getCurrentGeneration()))); + + if (blockingRead) { + UR_CALL(queueFinishUnlocked(lockedBatches)); + } + + return UR_RESULT_SUCCESS; + } catch (...) 
{ + return exceptionToResult(std::current_exception()); + } +} + +ur_result_t ur_queue_batched_t::enqueueMemBufferWrite( + ur_mem_handle_t hBuffer, bool blockingWrite, size_t offset, size_t size, + const void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) try { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::enqueueMemBufferWrite"); + + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatches = currentCmdLists.lock(); + + lockedBatches->markIssuedCommand(); + + UR_CALL(lockedBatches->getActiveBatch().appendMemBufferWrite( + hBuffer, false, offset, size, pSrc, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatches->getCurrentGeneration()))); + + if (blockingWrite) { + UR_CALL(queueFinishUnlocked(lockedBatches)); + } + + return UR_RESULT_SUCCESS; +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t ur_queue_batched_t::enqueueDeviceGlobalVariableWrite( + ur_program_handle_t hProgram, const char *name, bool blockingWrite, + size_t count, size_t offset, const void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendDeviceGlobalVariableWrite( + hProgram, name, false, count, offset, pSrc, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blockingWrite) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueDeviceGlobalVariableRead( + ur_program_handle_t hProgram, const char *name, bool blockingRead, + size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendDeviceGlobalVariableRead( + hProgram, name, false, count, offset, pDst, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blockingRead) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueMemBufferFill( + ur_mem_handle_t hBuffer, const void *pPattern, size_t patternSize, + size_t offset, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) try { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::enqueueMemBufferFill"); + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendMemBufferFill( + hBuffer, pPattern, patternSize, offset, size, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); + +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + +ur_result_t ur_queue_batched_t::enqueueUSMMemcpy( + bool blocking, void *pDst, const void *pSrc, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendUSMMemcpy( + false, pDst, pSrc, size, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blocking) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueUSMFreeExp( + ur_usm_pool_handle_t pPool, void *pMem, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendUSMFreeExp( + this, pPool, pMem, waitListView, + createEventAndRetainRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + return queueFlushUnlocked(lockedBatch); +} + +ur_result_t ur_queue_batched_t::enqueueMemBufferMap( + ur_mem_handle_t hBuffer, bool blockingMap, ur_map_flags_t mapFlags, + size_t offset, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, + void **ppRetMap) { + + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendMemBufferMap( + hBuffer, false, mapFlags, offset, size, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()), + ppRetMap)); + + if (blockingMap) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueMemUnmap( + ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendMemUnmap( + hMem, pMappedPtr, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +ur_result_t ur_queue_batched_t::enqueueMemBufferReadRect( + ur_mem_handle_t hBuffer, bool blockingRead, ur_rect_offset_t bufferOrigin, + ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, + size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, + void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendMemBufferReadRect( + hBuffer, false, bufferOrigin, hostOrigin, region, bufferRowPitch, + bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if 
(blockingRead) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueMemBufferWriteRect( + ur_mem_handle_t hBuffer, bool blockingWrite, ur_rect_offset_t bufferOrigin, + ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, + size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, + void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendMemBufferWriteRect( + hBuffer, false, bufferOrigin, hostOrigin, region, bufferRowPitch, + bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blockingWrite) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueUSMAdvise(const void *pMem, size_t size, + ur_usm_advice_flags_t advice, + ur_event_handle_t *phEvent) { + wait_list_view emptyWaitList = wait_list_view(nullptr, 0, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendUSMAdvise( + pMem, size, advice, emptyWaitList, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +ur_result_t ur_queue_batched_t::enqueueUSMMemcpy2D( + bool blocking, void *pDst, size_t dstPitch, const void *pSrc, + size_t srcPitch, size_t width, size_t height, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendUSMMemcpy2D( + false, pDst, dstPitch, pSrc, srcPitch, width, height, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blocking) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueUSMFill2D( + void *pMem, size_t pitch, size_t patternSize, const void *pPattern, + size_t width, size_t height, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendUSMFill2D( + pMem, pitch, patternSize, pPattern, width, height, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +ur_result_t ur_queue_batched_t::enqueueUSMPrefetch( + const void *pMem, size_t size, ur_usm_migration_flags_t flags, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendUSMPrefetch( + pMem, size, flags, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + 
+ur_result_t ur_queue_batched_t::enqueueMemBufferCopyRect( + ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, + ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, + ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, + size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendMemBufferCopyRect( + hBufferSrc, hBufferDst, srcOrigin, dstOrigin, region, srcRowPitch, + srcSlicePitch, dstRowPitch, dstSlicePitch, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +ur_result_t ur_queue_batched_t::enqueueEventsWaitWithBarrier( + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + if ((flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0) { + UR_CALL(lockedBatch->getActiveBatch().appendEventsWaitWithBarrier( + waitListView, createEventIfRequestedRegular( + phEvent, lockedBatch->getCurrentGeneration()))); + } else { + UR_CALL(lockedBatch->getActiveBatch().appendEventsWait( + waitListView, createEventIfRequestedRegular( + phEvent, lockedBatch->getCurrentGeneration()))); + } + + return queueFlushUnlocked(lockedBatch); +} + +ur_result_t +ur_queue_batched_t::enqueueEventsWait(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendEventsWait( + waitListView, createEventIfRequestedRegular( + phEvent, lockedBatch->getCurrentGeneration()))); + + return queueFlushUnlocked(lockedBatch); +} + +ur_result_t ur_queue_batched_t::enqueueMemBufferCopy( + ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, size_t srcOffset, + size_t dstOffset, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendMemBufferCopy( + hBufferSrc, hBufferDst, srcOffset, dstOffset, size, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +ur_result_t ur_queue_batched_t::enqueueUSMFill( + void *pMem, size_t patternSize, const void *pPattern, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendUSMFill( + pMem, patternSize, pPattern, size, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +ur_result_t ur_queue_batched_t::enqueueMemImageRead( + ur_mem_handle_t hImage, bool blockingRead, 
ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pDst, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendMemImageRead( + hImage, false, origin, region, rowPitch, slicePitch, pDst, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blockingRead) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueMemImageWrite( + ur_mem_handle_t hImage, bool blockingWrite, ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendMemImageWrite( + hImage, false, origin, region, rowPitch, slicePitch, pSrc, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blockingWrite) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueMemImageCopy( + ur_mem_handle_t hImageSrc, ur_mem_handle_t hImageDst, + ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, + ur_rect_region_t region, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendMemImageCopy( + hImageSrc, hImageDst, srcOrigin, dstOrigin, region, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +ur_result_t ur_queue_batched_t::enqueueReadHostPipe( + ur_program_handle_t hProgram, const char *pipe_symbol, bool blocking, + void *pDst, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendReadHostPipe( + hProgram, pipe_symbol, false, pDst, size, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blocking) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueWriteHostPipe( + ur_program_handle_t hProgram, const char *pipe_symbol, bool blocking, + void *pSrc, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendWriteHostPipe( + hProgram, pipe_symbol, false, pSrc, size, waitListView, + 
createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blocking) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueUSMDeviceAllocExp( + ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendUSMAllocHelper( + this, pPool, size, pProperties, waitListView, ppMem, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()), + UR_USM_TYPE_DEVICE); +} + +ur_result_t ur_queue_batched_t::enqueueUSMSharedAllocExp( + ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent) { + + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendUSMAllocHelper( + this, pPool, size, pProperties, waitListView, ppMem, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()), + UR_USM_TYPE_SHARED); +} + +ur_result_t ur_queue_batched_t::enqueueUSMHostAllocExp( + ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendUSMAllocHelper( + this, pPool, size, pProperties, waitListView, ppMem, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()), + UR_USM_TYPE_HOST); +} + +ur_result_t ur_queue_batched_t::bindlessImagesImageCopyExp( + const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, + const ur_image_desc_t *pDstImageDesc, + const ur_image_format_t *pSrcImageFormat, + const ur_image_format_t *pDstImageFormat, + ur_exp_image_copy_region_t *pCopyRegion, + ur_exp_image_copy_flags_t imageCopyFlags, + ur_exp_image_copy_input_types_t imageCopyInputTypes, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().bindlessImagesImageCopyExp( + pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, + pDstImageFormat, pCopyRegion, imageCopyFlags, imageCopyInputTypes, + waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +ur_result_t ur_queue_batched_t::bindlessImagesWaitExternalSemaphoreExp( + ur_exp_external_semaphore_handle_t hSemaphore, bool hasWaitValue, + uint64_t waitValue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + 
wait_list_view(phEventWaitList, numEventsInWaitList, this);
+
+  auto lockedBatch = currentCmdLists.lock();
+
+  lockedBatch->markIssuedCommand();
+
+  return lockedBatch->getActiveBatch().bindlessImagesWaitExternalSemaphoreExp(
+      hSemaphore, hasWaitValue, waitValue, waitListView,
+      createEventIfRequestedRegular(phEvent,
+                                    lockedBatch->getCurrentGeneration()));
+}
+
+ur_result_t ur_queue_batched_t::bindlessImagesSignalExternalSemaphoreExp(
+    ur_exp_external_semaphore_handle_t hSemaphore, bool hasSignalValue,
+    uint64_t signalValue, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  wait_list_view waitListView =
+      wait_list_view(phEventWaitList, numEventsInWaitList, this);
+
+  auto lockedBatch = currentCmdLists.lock();
+
+  lockedBatch->markIssuedCommand();
+
+  return lockedBatch->getActiveBatch().bindlessImagesSignalExternalSemaphoreExp(
+      hSemaphore, hasSignalValue, signalValue, waitListView,
+      createEventIfRequestedRegular(phEvent,
+                                    lockedBatch->getCurrentGeneration()));
+}
+
+/*
+For queues with batched submissions, which use regular command lists
+(similarly to command buffers), the start timestamp would be recorded as soon
+as the operation is enqueued (event.recordStartTimestamp() in
+appendTimestampRecordingExp uses the device directly rather than the queue),
+whereas the end timestamp would only be signalled once the given regular
+command list has been submitted and executed. As a result, the difference
+between the start and end timestamps would mostly reflect the delay of the
+batch submission, while the difference between consecutive end timestamps
+would reflect the actual execution time.
+
+TODO: adjust timestamp recording for batched queues so that it matches the
+semantics of the original function.
+*/
+
+ur_result_t ur_queue_batched_t::enqueueTimestampRecordingExp(
+    bool /* blocking */, uint32_t /* numEventsInWaitList */,
+    const ur_event_handle_t * /* phEventWaitList */,
+    ur_event_handle_t * /* phEvent */) {
+
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  /* wait_list_view waitListView =
+      wait_list_view(phEventWaitList, numEventsInWaitList, this);
+
+  auto lockedBatch = currentCmdLists.lock();
+
+  lockedBatch->markIssuedCommand();
+
+  UR_CALL(lockedBatch->getActiveBatch().appendTimestampRecordingExp(
+      false, waitListView,
+      createEventIfRequestedRegular(phEvent,
+                                    lockedBatch->getCurrentGeneration())));
+
+  if (blocking) {
+    UR_CALL(queueFinishUnlocked(lockedBatch));
+  }
+
+  return UR_RESULT_SUCCESS; */
+}
+
+ur_result_t ur_queue_batched_t::enqueueCommandBufferExp(
+    ur_exp_command_buffer_handle_t hCommandBuffer, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  wait_list_view waitListView =
+      wait_list_view(phEventWaitList, numEventsInWaitList, this);
+
+  auto lockedBatch = currentCmdLists.lock();
+
+  lockedBatch->markIssuedCommand();
+
+  return lockedBatch->getActiveBatch().appendCommandBufferExp(
+      hCommandBuffer, waitListView,
+      createEventAndRetainRegular(phEvent,
+                                  lockedBatch->getCurrentGeneration()));
+}
+
+ur_result_t ur_queue_batched_t::enqueueNativeCommandExp(
+    ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data,
+    uint32_t numMemsInMemList, const ur_mem_handle_t *phMemList,
+    const ur_exp_enqueue_native_command_properties_t *pProperties,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+  wait_list_view waitListView =
+      wait_list_view(phEventWaitList, numEventsInWaitList, this);
+
+  auto lockedBatch = currentCmdLists.lock();
+
+  lockedBatch->markIssuedCommand();
+
+  return lockedBatch->getActiveBatch().appendNativeCommandExp(
+      pfnNativeEnqueue, data, numMemsInMemList, phMemList, pProperties,
+      waitListView,
+      createEventIfRequestedRegular(phEvent,
+                                    lockedBatch->getCurrentGeneration()));
+}
+
+ur_result_t ur_queue_batched_t::queueGetInfo(ur_queue_info_t propName,
+                                             size_t propSize, void *pPropValue,
+                                             size_t *pPropSizeRet) {
+  UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
+  // TODO: consider support for queue properties and size
+  switch ((uint32_t)propName) { // cast to avoid warnings on EXT enum values
+  case UR_QUEUE_INFO_CONTEXT:
+    return ReturnValue(hContext);
+  case UR_QUEUE_INFO_DEVICE:
+    return ReturnValue(hDevice);
+  case UR_QUEUE_INFO_REFERENCE_COUNT:
+    return ReturnValue(uint32_t{RefCount.getCount()});
+  case UR_QUEUE_INFO_FLAGS:
+    return ReturnValue(flags);
+  case UR_QUEUE_INFO_SIZE:
+  case UR_QUEUE_INFO_DEVICE_DEFAULT:
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+  case UR_QUEUE_INFO_EMPTY: {
+    bool isBatchEmpty = currentCmdLists.get_no_lock()->isActiveBatchEmpty();
+    if (isBatchEmpty) {
+      auto status = ZE_CALL_NOCHECK(
+          zeCommandListHostSynchronize,
+          (currentCmdLists.get_no_lock()->getImmediateListHandle(), 0));
+      if (status == ZE_RESULT_SUCCESS) {
+        return ReturnValue(true);
+      } else if (status == ZE_RESULT_NOT_READY) {
+        return ReturnValue(false);
+      } else {
+        return ze2urResult(status);
+      }
+    } else {
+      return ReturnValue(false);
+    }
+  }
+  default:
+    UR_LOG(ERR,
+           "Unsupported ParamName in urQueueGetInfo: ParamName={}(0x{})",
+           propName, logger::toHex(propName));
+    return UR_RESULT_ERROR_INVALID_VALUE;
+  }
+
+  return UR_RESULT_SUCCESS;
+}
+
+ur_result_t
+ur_queue_batched_t::queueGetNativeHandle(ur_queue_native_desc_t * /*pDesc*/,
+                                         ur_native_handle_t *phNativeQueue) {
+  *phNativeQueue = reinterpret_cast<ur_native_handle_t>(
+      currentCmdLists.get_no_lock()->getImmediateListHandle());
+  return UR_RESULT_SUCCESS;
+}
+
+ur_result_t
+ur_queue_batched_t::queueFlushUnlocked(locked<batch_manager> &batchLocked) {
+  UR_CALL(batchLocked->enqueueCurrentBatchUnlocked());
+
+  return renewBatchUnlocked(batchLocked);
+}
+
+ur_result_t ur_queue_batched_t::queueFlush() {
+  auto batchLocked = currentCmdLists.lock();
+  return queueFlushUnlocked(batchLocked);
+}
+
+} // namespace v2
diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp
new file mode 100644
index 0000000000000..4dd17127de9c2
--- /dev/null
+++ b/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp
@@ -0,0 +1,439 @@
+//===--------------- queue_batched.hpp - Level Zero Adapter ---------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions.
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "../common.hpp" +#include "../device.hpp" + +#include "command_list_cache.hpp" +#include "common/ur_ref_count.hpp" +#include "context.hpp" +#include "event.hpp" +#include "event_pool_cache.hpp" +#include "memory.hpp" +#include "queue_api.hpp" + +#include "ur/ur.hpp" + +#include "command_buffer.hpp" +#include "command_list_manager.hpp" +#include "lockable.hpp" +#include "queue_immediate_in_order.hpp" +#include "ur_api.h" +#include "ze_api.h" + +/* Batched queues enable submission of operations to the driver in batches, + * therefore reducing the overhead of submitting every single operation + * individually. Similarly to command buffers in L0v2, they use regular command + * lists (later referenced as 'batches'). Operations enqueued on regular command + * lists are not executed immediately, but only after enqueueing the regular + * command list on an immediate command list. However, in contrast to command + * buffers, batched queues also handle submission of batches (regular command + * lists) instead of only collecting enqueued operations, by using an internal + * immediate command list. Command lists are managed by a batch_manager inside a + * batched queue. + * + * Batched queues can be enabled by setting UR_QUEUE_FLAG_SUBMISSION_BATCHED in + * ur_queue_flags_t or globally, through the environment variable + * UR_L0_FORCE_BATCHED=1. + */ + +namespace v2 { + +struct batch_manager { +private: + /* The currently active regular command list, which may be replaced in the + * command list manager, submitted for execution on the immediate command list + * and stored in the vector of submitted batches while awaiting execution + * completion + */ + ur_command_list_manager activeBatch; + // An immediate command list for submission of batches + ur_command_list_manager immediateList; + /* Submitted batches (regular command lists), stored for the completion of + * their execution. After queueFinish(), the vector is cleared - at this + * point, the destructor of command_list_handle adds the given command list to + * the command list cache, to the stack assigned to the description of the + * command list. When a new regular command list is requested after + * queueFinish(), it is popped from the available stack rather than retrieved + * through a driver call, which improves performance. + */ + std::vector runBatches; + /* The generation number of the current batch, assigned to events associated + * with operations enqueued on the given batch. It is incremented during every + * replacement of the current batch. When an event created by a batched queue + * appears in an eventWaitList, the batch assigned to the given event might + * not have been executed yet and the event might never be signalled. + * Comparing generation numbers enables determining whether the current batch + * should be submitted for execution. If the generation number of the current + * batch is higher than the number assigned to the given event, the batch + * associated with the event has already been submitted for execution and + * additional submission of the current batch is not needed. + */ + ur_event_generation_t regularGenerationNumber; + /* The limit of regular command lists stored for execution; if exceeded, the + * vector is cleared as part of queueFinish and slots are renewed. 
+ */ + static constexpr uint64_t initialSlotsForBatches = 10; + // Whether any operation has been enqueued on the current batch + bool isEmpty = true; + +public: + batch_manager(ur_context_handle_t context, ur_device_handle_t device, + v2::raii::command_list_unique_handle &&commandListRegular, + v2::raii::command_list_unique_handle &&commandListImmediate) + : activeBatch(context, device, + std::forward( + commandListRegular)), + immediateList(context, device, + std::forward( + commandListImmediate)), + regularGenerationNumber(0) { + runBatches.reserve(initialSlotsForBatches); + } + + ur_result_t hostSynchronize(); + + ur_result_t + renewRegularUnlocked(v2::raii::command_list_unique_handle &&newRegularBatch); + + bool isCurrentGeneration(ur_event_generation_t batch_generation) { + return batch_generation == regularGenerationNumber; + } + + ur_result_t enqueueCurrentBatchUnlocked(); + + ur_command_list_manager &getActiveBatch() { return activeBatch; } + + ur_event_generation_t getCurrentGeneration() { + return regularGenerationNumber; + } + + ur_result_t batchFinish(); + + ze_command_list_handle_t getImmediateListHandle() { + return immediateList.getZeCommandList(); + } + + ze_command_list_handle_t getRegularListHandle() { + return activeBatch.getZeCommandList(); + } + + bool isActiveBatchEmpty() { return isEmpty; } + + void markIssuedCommand() { isEmpty = false; } + + void setBatchEmpty() { isEmpty = true; } + + bool isLimitOfUsedCommandListsReached() { + return initialSlotsForBatches <= runBatches.size(); + } +}; + +struct ur_queue_batched_t : ur_object, ur_queue_t_ { +private: + ur_context_handle_t hContext; + ur_device_handle_t hDevice; + + v2::command_list_desc_t regularCmdListDesc; + lockable currentCmdLists; + + ur_queue_flags_t flags; + + /* Regular command lists use the regular pool cache type, whereas immediate + * command lists use the immediate pool cache type. Since user-requested + * operations are enqueued on regular command lists and immediate command + * lists are only used internally by the batched queue implementation, events + * are not created for immediate command lists. 
+ */ + v2::raii::cache_borrowed_event_pool eventPoolRegular; + + v2::raii::command_list_unique_handle getNewRegularCmdList() { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::getNewRegularCmdList"); + + return hContext->getCommandListCache().getRegularCommandList( + hDevice->ZeDevice, regularCmdListDesc); + } + + ur_result_t renewBatchUnlocked(locked &batchLocked); + + ur_event_handle_t + createEventIfRequestedRegular(ur_event_handle_t *phEvent, + ur_event_generation_t generation_number); + + ur_event_handle_t + createEventAndRetainRegular(ur_event_handle_t *phEvent, + ur_event_generation_t batch_generation); + + ur_result_t queueFinishPoolsUnlocked(); + + ur_result_t queueFinishUnlocked(locked &batchLocked); + + ur_result_t queueFlushUnlocked(locked &batchLocked); + +public: + ur_queue_batched_t(ur_context_handle_t, ur_device_handle_t, uint32_t ordinal, + ze_command_queue_priority_t priority, + std::optional index, event_flags_t eventFlags, + ur_queue_flags_t flags); + + ur_result_t + onEventWaitListUse(ur_event_generation_t batch_generation) override; + + ~ur_queue_batched_t(); + + ur_result_t queueGetInfo(ur_queue_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) override; + ur_result_t queueGetNativeHandle(ur_queue_native_desc_t *pDesc, + ur_native_handle_t *phNativeQueue) override; + ur_result_t queueFinish() override; + ur_result_t queueFlush() override; + ur_result_t enqueueKernelLaunch( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t + enqueueEventsWaitWithBarrier(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueEventsWait(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t + enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + return enqueueEventsWaitWithBarrier(numEventsInWaitList, phEventWaitList, + phEvent); + } + + ur_result_t enqueueMemBufferRead(ur_mem_handle_t hBuffer, bool blockingRead, + size_t offset, size_t size, void *pDst, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueMemBufferWrite(ur_mem_handle_t hBuffer, bool blockingWrite, + size_t offset, size_t size, + const void *pSrc, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueMemBufferReadRect( + ur_mem_handle_t hBuffer, bool blockingRead, ur_rect_offset_t bufferOrigin, + ur_rect_offset_t hostOrigin, ur_rect_region_t region, + size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, + size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueMemBufferWriteRect( + ur_mem_handle_t hBuffer, bool blockingWrite, + ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, 
void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueMemBufferCopy(ur_mem_handle_t hBufferSrc, + ur_mem_handle_t hBufferDst, size_t srcOffset, + size_t dstOffset, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueMemBufferCopyRect( + ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, + ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, + ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, + size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueMemBufferFill(ur_mem_handle_t hBuffer, + const void *pPattern, size_t patternSize, + size_t offset, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueMemImageRead(ur_mem_handle_t hImage, bool blockingRead, + ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pDst, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueMemImageWrite(ur_mem_handle_t hImage, bool blockingWrite, + ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pSrc, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t + enqueueMemImageCopy(ur_mem_handle_t hImageSrc, ur_mem_handle_t hImageDst, + ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, + ur_rect_region_t region, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueMemBufferMap(ur_mem_handle_t hBuffer, bool blockingMap, + ur_map_flags_t mapFlags, size_t offset, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, + void **ppRetMap) override; + + ur_result_t enqueueMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueUSMFill(void *pMem, size_t patternSize, + const void *pPattern, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueUSMMemcpy(bool blocking, void *pDst, const void *pSrc, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueUSMFill2D(void *pMem, size_t pitch, size_t patternSize, + const void *pPattern, size_t width, + size_t height, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueUSMMemcpy2D(bool blocking, void *pDst, size_t dstPitch, + const void *pSrc, size_t srcPitch, + size_t width, size_t height, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueUSMPrefetch(const void *pMem, size_t size, + ur_usm_migration_flags_t flags, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t 
enqueueUSMAdvise(const void *pMem, size_t size, + ur_usm_advice_flags_t advice, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueDeviceGlobalVariableWrite( + ur_program_handle_t hProgram, const char *name, bool blockingWrite, + size_t count, size_t offset, const void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueDeviceGlobalVariableRead( + ur_program_handle_t hProgram, const char *name, bool blockingRead, + size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueReadHostPipe(ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, + void *pDst, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueWriteHostPipe(ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, + void *pSrc, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueUSMDeviceAllocExp( + ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent) override; + + ur_result_t enqueueUSMSharedAllocExp( + ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent) override; + + ur_result_t + enqueueUSMHostAllocExp(ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, void **ppMem, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueUSMFreeExp(ur_usm_pool_handle_t pPool, void *pMem, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t bindlessImagesImageCopyExp( + const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, + const ur_image_desc_t *pDstImageDesc, + const ur_image_format_t *pSrcImageFormat, + const ur_image_format_t *pDstImageFormat, + ur_exp_image_copy_region_t *pCopyRegion, + ur_exp_image_copy_flags_t imageCopyFlags, + ur_exp_image_copy_input_types_t imageCopyInputTypes, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t bindlessImagesWaitExternalSemaphoreExp( + ur_exp_external_semaphore_handle_t hSemaphore, bool hasWaitValue, + uint64_t waitValue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t bindlessImagesSignalExternalSemaphoreExp( + ur_exp_external_semaphore_handle_t hSemaphore, bool hasSignalValue, + uint64_t signalValue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t + enqueueTimestampRecordingExp(bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t + enqueueCommandBufferExp(ur_exp_command_buffer_handle_t hCommandBuffer, + uint32_t numEventsInWaitList, + const ur_event_handle_t 
*phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueNativeCommandExp( + ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data, + uint32_t numMemsInMemList, const ur_mem_handle_t *phMemList, + const ur_exp_enqueue_native_command_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur::RefCount RefCount; +}; + +} // namespace v2 diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp index be211cb198438..50db08d7a2726 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp @@ -12,9 +12,12 @@ #include "logger/ur_logger.hpp" #include "queue_api.hpp" +#include "queue_batched.hpp" #include "queue_handle.hpp" #include "queue_immediate_in_order.hpp" +static const bool ForceBatched = getenv_tobool("UR_L0_FORCE_BATCHED"); + namespace v2 { using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; @@ -62,11 +65,21 @@ ur_result_t urQueueCreate(ur_context_handle_t hContext, return UR_RESULT_ERROR_INVALID_DEVICE; } + TRACK_SCOPE_LATENCY("queueCreate"); + ur_queue_flags_t flags = 0; if (pProperties) { flags = pProperties->flags; } + if (ForceBatched) { + flags |= UR_QUEUE_FLAG_SUBMISSION_BATCHED; + } + + // TODO remove | this is just for tests in CI + // As of 16.10.205, I still remmeber about removal + // flags |= UR_QUEUE_FLAG_SUBMISSION_BATCHED; + auto zeIndex = v2::getZeIndex(pProperties); if ((flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0) { @@ -75,6 +88,11 @@ ur_result_t urQueueCreate(ur_context_handle_t hContext, hContext, hDevice, v2::getZeOrdinal(hDevice), v2::getZePriority(flags), zeIndex, v2::eventFlagsFromQueueFlags(flags), flags); + } else if (flags & UR_QUEUE_FLAG_SUBMISSION_BATCHED) { + *phQueue = ur_queue_handle_t_::create( + hContext, hDevice, v2::getZeOrdinal(hDevice), v2::getZePriority(flags), + zeIndex, v2::eventFlagsFromQueueFlags(flags), flags); + } else { *phQueue = ur_queue_handle_t_::create( hContext, hDevice, v2::getZeOrdinal(hDevice), v2::getZePriority(flags), diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_extensions.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_extensions.hpp new file mode 100644 index 0000000000000..110866b0f4e85 --- /dev/null +++ b/unified-runtime/source/adapters/level_zero/v2/queue_extensions.hpp @@ -0,0 +1,22 @@ +// Copyright (C) 2025 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include "ur_api.h" + +struct ur_queue_extensions { + /* Non-batched queues don't need to perform any action + + This function is intended to be called by the event. If the event has been + created by the given queue and is associated with the current batch, this + batch should be enqueued for execution. 
Otherwise, the event would never be + signalled */ + virtual ur_result_t + onEventWaitListUse([[maybe_unused]] int64_t batch_generation) { + return UR_RESULT_SUCCESS; + } +}; diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp index c414f79a46d71..a90bd8a27868a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp @@ -16,13 +16,15 @@ #include #include "../common.hpp" +#include "queue_batched.hpp" #include "queue_immediate_in_order.hpp" #include "queue_immediate_out_of_order.hpp" #include struct ur_queue_handle_t_ : ur::handle_base { using data_variant = std::variant; + v2::ur_queue_immediate_out_of_order_t, + v2::ur_queue_batched_t>; data_variant queue_data; static constexpr uintptr_t queue_offset = diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 1e4beaf363da0..27ef4d6a5ca1b 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -105,16 +105,26 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() { auto lockedCommandListManager = commandListManager.lock(); - ZE2UR_CALL(zeCommandListHostSynchronize, - (lockedCommandListManager->getZeCommandList(), UINT64_MAX)); + { + TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::hostSynchronize"); + ZE2UR_CALL(zeCommandListHostSynchronize, + (lockedCommandListManager->getZeCommandList(), UINT64_MAX)); + } - hContext->getAsyncPool()->cleanupPoolsForQueue(this); - hContext->forEachUsmPool([this](ur_usm_pool_handle_t hPool) { - hPool->cleanupPoolsForQueue(this); - return true; - }); + { + TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::asyncPools"); + hContext->getAsyncPool()->cleanupPoolsForQueue(this); + hContext->forEachUsmPool([this](ur_usm_pool_handle_t hPool) { + hPool->cleanupPoolsForQueue(this); + return true; + }); + } - UR_CALL(lockedCommandListManager->releaseSubmittedKernels()); + { + TRACK_SCOPE_LATENCY( + "ur_queue_immediate_in_order_t::releaseSubmittedKernels"); + UR_CALL(lockedCommandListManager->releaseSubmittedKernels()); + } return UR_RESULT_SUCCESS; } @@ -142,14 +152,15 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrier( // in this queue are completed when the signal is started. 
However, we do // need to use barrier if profiling is enabled: see // zeCommandListAppendWaitOnEvents + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + if ((flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0) { return commandListManager.lock()->appendEventsWaitWithBarrier( - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } else { return commandListManager.lock()->appendEventsWait( - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } } diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index 3f230861ad563..ba13ba507f11a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -60,10 +60,12 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { const ur_kernel_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendKernelLaunch( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, numEventsInWaitList, - phEventWaitList, + numPropsInLaunchPropList, launchPropList, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t @@ -74,9 +76,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_result_t enqueueEventsWait(uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendEventsWait( - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, @@ -92,9 +96,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemBufferRead( - hBuffer, blockingRead, offset, size, pDst, numEventsInWaitList, - phEventWaitList, + hBuffer, blockingRead, offset, size, pDst, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -104,9 +110,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemBufferWrite( - hBuffer, blockingWrite, offset, size, pSrc, numEventsInWaitList, - phEventWaitList, + hBuffer, blockingWrite, offset, size, pSrc, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -117,10 +125,12 @@ struct ur_queue_immediate_in_order_t : ur_object, 
ur_queue_t_ { size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemBufferReadRect( hBuffer, blockingRead, bufferOrigin, hostOrigin, region, bufferRowPitch, - bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, - numEventsInWaitList, phEventWaitList, + bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -131,11 +141,13 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemBufferWriteRect( hBuffer, blockingWrite, bufferOrigin, hostOrigin, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemBufferCopy(ur_mem_handle_t hBufferSrc, @@ -144,9 +156,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemBufferCopy( - hBufferSrc, hBufferDst, srcOffset, dstOffset, size, numEventsInWaitList, - phEventWaitList, + hBufferSrc, hBufferDst, srcOffset, dstOffset, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -157,10 +171,12 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemBufferCopyRect( hBufferSrc, hBufferDst, srcOrigin, dstOrigin, region, srcRowPitch, - srcSlicePitch, dstRowPitch, dstSlicePitch, numEventsInWaitList, - phEventWaitList, + srcSlicePitch, dstRowPitch, dstSlicePitch, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -170,9 +186,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemBufferFill( - hBuffer, pPattern, patternSize, offset, size, numEventsInWaitList, - phEventWaitList, + hBuffer, pPattern, patternSize, offset, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -183,10 +201,12 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemImageRead( hImage, blockingRead, origin, region, rowPitch, slicePitch, pDst, - 
numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemImageWrite(ur_mem_handle_t hImage, bool blockingWrite, @@ -196,10 +216,12 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemImageWrite( hImage, blockingWrite, origin, region, rowPitch, slicePitch, pSrc, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t @@ -208,9 +230,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_rect_region_t region, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemImageCopy( - hImageSrc, hImageDst, srcOrigin, dstOrigin, region, numEventsInWaitList, - phEventWaitList, + hImageSrc, hImageDst, srcOrigin, dstOrigin, region, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -220,18 +244,23 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, void **ppRetMap) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemBufferMap( - hBuffer, blockingMap, mapFlags, offset, size, numEventsInWaitList, - phEventWaitList, createEventIfRequested(eventPool.get(), phEvent, this), - ppRetMap); + hBuffer, blockingMap, mapFlags, offset, size, waitListView, + createEventIfRequested(eventPool.get(), phEvent, this), ppRetMap); } ur_result_t enqueueMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemUnmap( - hMem, pMappedPtr, numEventsInWaitList, phEventWaitList, + hMem, pMappedPtr, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -240,8 +269,10 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); return commandListManager.lock()->appendUSMFill( - pMem, patternSize, pPattern, size, numEventsInWaitList, phEventWaitList, + pMem, patternSize, pPattern, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -249,8 +280,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendUSMMemcpy( - blocking, pDst, pSrc, size, numEventsInWaitList, phEventWaitList, + blocking, pDst, pSrc, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); 
} @@ -259,9 +293,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { size_t height, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendUSMFill2D( - pMem, pitch, patternSize, pPattern, width, height, numEventsInWaitList, - phEventWaitList, + pMem, pitch, patternSize, pPattern, width, height, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -271,9 +307,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendUSMMemcpy2D( - blocking, pDst, dstPitch, pSrc, srcPitch, width, height, - numEventsInWaitList, phEventWaitList, + blocking, pDst, dstPitch, pSrc, srcPitch, width, height, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -282,16 +320,21 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendUSMPrefetch( - pMem, size, flags, numEventsInWaitList, phEventWaitList, + pMem, size, flags, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueUSMAdvise(const void *pMem, size_t size, ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) override { + wait_list_view emptyWaitList = wait_list_view(nullptr, 0); + return commandListManager.lock()->appendUSMAdvise( - pMem, size, advice, 0, nullptr, + pMem, size, advice, emptyWaitList, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -300,9 +343,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { size_t count, size_t offset, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendDeviceGlobalVariableWrite( - hProgram, name, blockingWrite, count, offset, pSrc, numEventsInWaitList, - phEventWaitList, + hProgram, name, blockingWrite, count, offset, pSrc, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -311,9 +356,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendDeviceGlobalVariableRead( - hProgram, name, blockingRead, count, offset, pDst, numEventsInWaitList, - phEventWaitList, + hProgram, name, blockingRead, count, offset, pDst, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -323,9 +370,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return 
commandListManager.lock()->appendReadHostPipe( - hProgram, pipe_symbol, blocking, pDst, size, numEventsInWaitList, - phEventWaitList, + hProgram, pipe_symbol, blocking, pDst, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -335,9 +384,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendWriteHostPipe( - hProgram, pipe_symbol, blocking, pSrc, size, numEventsInWaitList, - phEventWaitList, + hProgram, pipe_symbol, blocking, pSrc, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -346,9 +397,12 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { const ur_exp_async_usm_alloc_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendUSMAllocHelper( - this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, - ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + this, pPool, size, pProperties, waitListView, ppMem, + createEventIfRequested(eventPool.get(), phEvent, this), UR_USM_TYPE_DEVICE); } @@ -357,9 +411,12 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { const ur_exp_async_usm_alloc_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendUSMAllocHelper( - this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, - ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + this, pPool, size, pProperties, waitListView, ppMem, + createEventIfRequested(eventPool.get(), phEvent, this), UR_USM_TYPE_SHARED); } @@ -369,9 +426,12 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendUSMAllocHelper( - this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, - ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + this, pPool, size, pProperties, waitListView, ppMem, + createEventIfRequested(eventPool.get(), phEvent, this), UR_USM_TYPE_HOST); } @@ -379,8 +439,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendUSMFreeExp( - this, pPool, pMem, numEventsInWaitList, phEventWaitList, + this, pPool, pMem, waitListView, createEventAndRetain(eventPool.get(), phEvent, this)); } @@ -394,11 +457,13 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_exp_image_copy_input_types_t imageCopyInputTypes, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view 
waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->bindlessImagesImageCopyExp( pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, pDstImageFormat, pCopyRegion, imageCopyFlags, imageCopyInputTypes, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t bindlessImagesWaitExternalSemaphoreExp( @@ -406,9 +471,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint64_t waitValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->bindlessImagesWaitExternalSemaphoreExp( - hSemaphore, hasWaitValue, waitValue, numEventsInWaitList, - phEventWaitList, + hSemaphore, hasWaitValue, waitValue, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -417,9 +484,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint64_t signalValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->bindlessImagesSignalExternalSemaphoreExp( - hSemaphore, hasSignalValue, signalValue, numEventsInWaitList, - phEventWaitList, + hSemaphore, hasSignalValue, signalValue, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -427,8 +496,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { enqueueTimestampRecordingExp(bool blocking, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendTimestampRecordingExp( - blocking, numEventsInWaitList, phEventWaitList, + blocking, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -437,8 +509,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendCommandBufferExp( - hCommandBuffer, numEventsInWaitList, phEventWaitList, + hCommandBuffer, waitListView, createEventAndRetain(eventPool.get(), phEvent, this)); } @@ -448,10 +523,12 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { const ur_exp_enqueue_native_command_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendNativeCommandExp( pfnNativeEnqueue, data, numMemsInMemList, phMemList, pProperties, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur::RefCount RefCount; diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp index dae2e42f93069..83f7181f004ed 100644 --- 
a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp @@ -10,6 +10,7 @@ #include "queue_immediate_out_of_order.hpp" #include "../common/latency_tracker.hpp" +#include "command_list_manager.hpp" #include "ur.hpp" namespace v2 { @@ -153,6 +154,9 @@ ur_result_t ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrier( // commands in this queue are completed when the signal is started. However, // we do need to use barrier if profiling is enabled: see // zeCommandListAppendWaitOnEvents + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + bool needsRealBarrier = (flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0; auto barrierFn = needsRealBarrier ? &ur_command_list_manager::appendEventsWaitWithBarrier @@ -161,27 +165,30 @@ ur_result_t ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrier( auto commandListManagersLocked = commandListManagers.lock(); // Enqueue wait for the user-provider events on the first command list. - UR_CALL(commandListManagersLocked[0].appendEventsWait( - numEventsInWaitList, phEventWaitList, barrierEvents[0])); + UR_CALL(commandListManagersLocked[0].appendEventsWait(waitListView, + barrierEvents[0])); + + wait_list_view emptyWaitlist = wait_list_view(nullptr, 0); // Request barrierEvents[id] to be signaled on remaining command lists. for (size_t id = 1; id < numCommandLists; id++) { - UR_CALL(commandListManagersLocked[id].appendEventsWait(0, nullptr, + UR_CALL(commandListManagersLocked[id].appendEventsWait(emptyWaitlist, barrierEvents[id])); } // Enqueue barriers on all command lists by waiting on barrierEvents. + wait_list_view barrierEventsWaitList = + wait_list_view(barrierEvents.data(), numCommandLists); if (phEvent) { - UR_CALL( - std::invoke(barrierFn, commandListManagersLocked[0], numCommandLists, - barrierEvents.data(), - createEventIfRequested(eventPool.get(), phEvent, this))); + UR_CALL(std::invoke( + barrierFn, commandListManagersLocked[0], barrierEventsWaitList, + createEventIfRequested(eventPool.get(), phEvent, this))); } for (size_t id = phEvent ? 
1 : 0; id < numCommandLists; id++) { UR_CALL(std::invoke(barrierFn, commandListManagersLocked[0], - numCommandLists, barrierEvents.data(), nullptr)); + barrierEventsWaitList, nullptr)); } return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp index f1ad68a62a1a8..d6ad83a78512f 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp @@ -1,4 +1,4 @@ -//===--------- queue_immediate_in_order.hpp - Level Zero Adapter ---------===// +//===------- queue_immediate_out_of_order.hpp - Level Zero Adapter --------===// // // Copyright (C) 2025 Intel Corporation // @@ -73,11 +73,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { const ur_kernel_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendKernelLaunch( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, numEventsInWaitList, - phEventWaitList, + numPropsInLaunchPropList, launchPropList, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t @@ -88,10 +90,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { ur_result_t enqueueEventsWait(uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendEventsWait( - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, @@ -107,10 +111,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemBufferRead( - hBuffer, blockingRead, offset, size, pDst, numEventsInWaitList, - phEventWaitList, + hBuffer, blockingRead, offset, size, pDst, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -120,10 +126,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemBufferWrite( - hBuffer, blockingWrite, offset, size, pSrc, numEventsInWaitList, - phEventWaitList, + hBuffer, blockingWrite, offset, size, pSrc, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -134,11 +142,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, 
ur_queue_t_ { size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemBufferReadRect( hBuffer, blockingRead, bufferOrigin, hostOrigin, region, bufferRowPitch, - bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, - numEventsInWaitList, phEventWaitList, + bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -149,12 +159,14 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemBufferWriteRect( hBuffer, blockingWrite, bufferOrigin, hostOrigin, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemBufferCopy(ur_mem_handle_t hBufferSrc, @@ -163,10 +175,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemBufferCopy( - hBufferSrc, hBufferDst, srcOffset, dstOffset, size, numEventsInWaitList, - phEventWaitList, + hBufferSrc, hBufferDst, srcOffset, dstOffset, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -177,11 +191,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemBufferCopyRect( hBufferSrc, hBufferDst, srcOrigin, dstOrigin, region, srcRowPitch, - srcSlicePitch, dstRowPitch, dstSlicePitch, numEventsInWaitList, - phEventWaitList, + srcSlicePitch, dstRowPitch, dstSlicePitch, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -191,10 +207,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemBufferFill( - hBuffer, pPattern, patternSize, offset, size, numEventsInWaitList, - phEventWaitList, + hBuffer, pPattern, patternSize, offset, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -205,11 +223,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { 
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemImageRead( hImage, blockingRead, origin, region, rowPitch, slicePitch, pDst, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemImageWrite(ur_mem_handle_t hImage, bool blockingWrite, @@ -219,11 +239,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemImageWrite( hImage, blockingWrite, origin, region, rowPitch, slicePitch, pSrc, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t @@ -232,10 +254,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { ur_rect_region_t region, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemImageCopy( - hImageSrc, hImageDst, srcOrigin, dstOrigin, region, numEventsInWaitList, - phEventWaitList, + hImageSrc, hImageDst, srcOrigin, dstOrigin, region, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -245,20 +269,25 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, void **ppRetMap) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemBufferMap( - hBuffer, blockingMap, mapFlags, offset, size, numEventsInWaitList, - phEventWaitList, createEventIfRequested(eventPool.get(), phEvent, this), - ppRetMap); + hBuffer, blockingMap, mapFlags, offset, size, waitListView, + createEventIfRequested(eventPool.get(), phEvent, this), ppRetMap); } ur_result_t enqueueMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemUnmap( - hMem, pMappedPtr, numEventsInWaitList, phEventWaitList, + hMem, pMappedPtr, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -267,9 +296,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMFill( - 
pMem, patternSize, pPattern, size, numEventsInWaitList, phEventWaitList, + pMem, patternSize, pPattern, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -277,9 +309,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMMemcpy( - blocking, pDst, pSrc, size, numEventsInWaitList, phEventWaitList, + blocking, pDst, pSrc, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -288,10 +323,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { size_t height, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMFill2D( - pMem, pitch, patternSize, pPattern, width, height, numEventsInWaitList, - phEventWaitList, + pMem, pitch, patternSize, pPattern, width, height, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -301,10 +338,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMMemcpy2D( - blocking, pDst, dstPitch, pSrc, srcPitch, width, height, - numEventsInWaitList, phEventWaitList, + blocking, pDst, dstPitch, pSrc, srcPitch, width, height, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -313,18 +352,23 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMPrefetch( - pMem, size, flags, numEventsInWaitList, phEventWaitList, + pMem, size, flags, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueUSMAdvise(const void *pMem, size_t size, ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) override { + wait_list_view emptyWaitList = wait_list_view(nullptr, 0); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMAdvise( - pMem, size, advice, 0, nullptr, + pMem, size, advice, emptyWaitList, /* 0, nullptr, */ createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -333,11 +377,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { size_t count, size_t offset, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId] .appendDeviceGlobalVariableWrite( - hProgram, name, blockingWrite, count, 
offset, pSrc, - numEventsInWaitList, phEventWaitList, + hProgram, name, blockingWrite, count, offset, pSrc, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -346,11 +392,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId] .appendDeviceGlobalVariableRead( - hProgram, name, blockingRead, count, offset, pDst, - numEventsInWaitList, phEventWaitList, + hProgram, name, blockingRead, count, offset, pDst, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -360,10 +408,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendReadHostPipe( - hProgram, pipe_symbol, blocking, pDst, size, numEventsInWaitList, - phEventWaitList, + hProgram, pipe_symbol, blocking, pDst, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -373,10 +423,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendWriteHostPipe( - hProgram, pipe_symbol, blocking, pSrc, size, numEventsInWaitList, - phEventWaitList, + hProgram, pipe_symbol, blocking, pSrc, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -385,10 +437,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { const ur_exp_async_usm_alloc_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMAllocHelper( - this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, - ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + this, pPool, size, pProperties, waitListView, ppMem, + createEventIfRequested(eventPool.get(), phEvent, this), UR_USM_TYPE_DEVICE); } @@ -397,10 +452,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { const ur_exp_async_usm_alloc_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMAllocHelper( - this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, - ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + this, pPool, size, pProperties, waitListView, ppMem, + createEventIfRequested(eventPool.get(), phEvent, 
this), UR_USM_TYPE_SHARED); } @@ -410,10 +468,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMAllocHelper( - this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, - ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + this, pPool, size, pProperties, waitListView, ppMem, + createEventIfRequested(eventPool.get(), phEvent, this), UR_USM_TYPE_HOST); } @@ -421,9 +482,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMFreeExp( - this, pPool, pMem, numEventsInWaitList, phEventWaitList, + this, pPool, pMem, waitListView, createEventAndRetain(eventPool.get(), phEvent, this)); } @@ -437,12 +501,14 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { ur_exp_image_copy_input_types_t imageCopyInputTypes, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].bindlessImagesImageCopyExp( pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, pDstImageFormat, pCopyRegion, imageCopyFlags, imageCopyInputTypes, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t bindlessImagesWaitExternalSemaphoreExp( @@ -450,11 +516,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint64_t waitValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId] .bindlessImagesWaitExternalSemaphoreExp( - hSemaphore, hasWaitValue, waitValue, numEventsInWaitList, - phEventWaitList, + hSemaphore, hasWaitValue, waitValue, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -463,11 +531,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint64_t signalValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId] .bindlessImagesSignalExternalSemaphoreExp( - hSemaphore, hasSignalValue, signalValue, numEventsInWaitList, - phEventWaitList, + hSemaphore, hasSignalValue, signalValue, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -475,10 +545,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { enqueueTimestampRecordingExp(bool blocking, uint32_t numEventsInWaitList, const 
ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId] .appendTimestampRecordingExp( - blocking, numEventsInWaitList, phEventWaitList, + blocking, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -487,9 +560,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendCommandBufferExp( - hCommandBuffer, numEventsInWaitList, phEventWaitList, + hCommandBuffer, waitListView, createEventAndRetain(eventPool.get(), phEvent, this)); } @@ -499,11 +575,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { const ur_exp_enqueue_native_command_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendNativeCommandExp( pfnNativeEnqueue, data, numMemsInMemList, phMemList, pProperties, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur::RefCount RefCount; diff --git a/unified-runtime/source/adapters/native_cpu/queue.cpp b/unified-runtime/source/adapters/native_cpu/queue.cpp index 5de7037519490..0fe8311d83280 100644 --- a/unified-runtime/source/adapters/native_cpu/queue.cpp +++ b/unified-runtime/source/adapters/native_cpu/queue.cpp @@ -21,6 +21,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, size_t *pPropSizeRet) { UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + ur_queue_flags_t flags = 0; switch (propName) { case UR_QUEUE_INFO_CONTEXT: @@ -31,6 +32,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, return ReturnValue(hQueue->getReferenceCount()); case UR_QUEUE_INFO_EMPTY: return ReturnValue(hQueue->isEmpty()); + case UR_QUEUE_INFO_FLAGS: + /* + Support for UR_QUEUE_INFO_FLAGS in urQueueGetInfo is required by the + enqueueTimestampRecording tests after introducing batched queues, since + batched queues do not support enqueueTimestampRecording. 
+ */ + if (!hQueue->isInOrder()) { + flags |= UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + } + if (hQueue->isProfiling()) { + flags |= UR_QUEUE_FLAG_PROFILING_ENABLE; + } + + return ReturnValue(flags); default: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueTimestampRecording.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueTimestampRecording.cpp index 5f65044f66ad9..3a5657c7217cf 100644 --- a/unified-runtime/test/conformance/enqueue/urEnqueueTimestampRecording.cpp +++ b/unified-runtime/test/conformance/enqueue/urEnqueueTimestampRecording.cpp @@ -10,6 +10,16 @@ struct urEnqueueTimestampRecordingExpTest : uur::urQueueTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::SetUp()); + + ur_queue_flags_t queueFlags{}; + ASSERT_SUCCESS(urQueueGetInfo(queue, UR_QUEUE_INFO_FLAGS, + sizeof(ur_queue_flags_t), &queueFlags, + nullptr)); + + if (queueFlags & UR_QUEUE_FLAG_SUBMISSION_BATCHED) { + UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{}); + } + bool timestamp_recording_support = false; ASSERT_SUCCESS( uur::GetTimestampRecordingSupport(device, timestamp_recording_support)); diff --git a/unified-runtime/test/conformance/queue/urQueueGetInfo.cpp b/unified-runtime/test/conformance/queue/urQueueGetInfo.cpp index 05b1a0f76ff0c..eeb4bb2530bdb 100644 --- a/unified-runtime/test/conformance/queue/urQueueGetInfo.cpp +++ b/unified-runtime/test/conformance/queue/urQueueGetInfo.cpp @@ -125,8 +125,6 @@ TEST_P(urQueueGetInfoTest, SuccessRoundtripNullDevice) { } TEST_P(urQueueGetInfoTest, SuccessFlags) { - UUR_KNOWN_FAILURE_ON(uur::NativeCPU{}); - size_t property_size = 0; const ur_queue_info_t property_name = UR_QUEUE_INFO_FLAGS; From c79c8ea4e11a443d86242db5e90a61fce28526a1 Mon Sep 17 00:00:00 2001 From: Agata Momot Date: Fri, 17 Oct 2025 02:45:37 +0000 Subject: [PATCH 2/3] for tests in CI --- .../source/adapters/level_zero/v2/queue_create.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp index 50db08d7a2726..16138342dcdb3 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp @@ -77,8 +77,8 @@ ur_result_t urQueueCreate(ur_context_handle_t hContext, } // TODO remove | this is just for tests in CI - // As of 16.10.205, I still remmeber about removal - // flags |= UR_QUEUE_FLAG_SUBMISSION_BATCHED; + // As of 17.10.205, I still remmeber about removal + flags |= UR_QUEUE_FLAG_SUBMISSION_BATCHED; auto zeIndex = v2::getZeIndex(pProperties); From a034dfffbf91dd073118ffdfa0c30dee42f63bef Mon Sep 17 00:00:00 2001 From: Agata Momot Date: Fri, 17 Oct 2025 15:07:05 +0000 Subject: [PATCH 3/3] add batch queues workarounds and skips --- .../level_zero/v2/command_list_manager.cpp | 50 +++++----- .../source/adapters/level_zero/v2/event.hpp | 7 +- .../adapters/level_zero/v2/queue_batched.cpp | 50 +++++----- .../adapters/level_zero/v2/queue_batched.hpp | 91 +++++++++---------- .../level_zero/v2/queue_extensions.hpp | 12 +-- .../source/adapters/native_cpu/queue.cpp | 8 +- .../level_zero/v2/command_list_cache_test.cpp | 12 ++- .../level_zero/v2/event_pool_test.cpp | 2 + .../enqueue/urEnqueueTimestampRecording.cpp | 12 +-- .../conformance/testing/include/uur/utils.h | 24 +++++ 10 files changed, 147 insertions(+), 121 deletions(-) diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp 
b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp index ec350e4ea98c7..0f08c7047426d 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp @@ -19,31 +19,31 @@ thread_local std::vector waitList; -/* -The wait_list_view is a wrapper for eventsWaitLists, which: - - enables passing a ze_event_handle_t buffer created from events as an -argument for the driver API; - - handles enqueueing operations associated with given events if these -operations have not already been set for execution. - -Previously, it only stored the waitlist and the corresponding event count in a -single container. Currently, the constructor also ensures that all associated -operations will eventually be executed, which is required for batched queues in -L0v2. - -Wait events might have been created in batched queues, which use regular -command lists (batches). Since regular command lists are not executed -immediately, but only after enqueueing on immediate lists, it is necessary to -enqueue the regular command list associated with the given event. Otherwise, the -event would never be signalled. The enqueueing is performed in onWaitListView(). - -In the case of batched queues, the function onWaitListView() is not called if -the current queue created the given event. The operation associated with the -given wait_list_view is added to the current batch of the queue. The entire -batch is then enqueued for execution, i.e., as part of queueFinish or -queueFlush. For the same queue, events from the given eventsWaitList are -enqueued before the associated operation is executed. -*/ +// The wait_list_view is a wrapper for eventsWaitLists, which: +// - enables passing a ze_event_handle_t buffer created from events as an +// argument for the driver API; +// - handles enqueueing operations associated with given events if these +// operations have not already been set for execution. +// +// Previously, it only stored the waitlist and the corresponding event count in +// a single container. Currently, the constructor also ensures that all +// associated operations will eventually be executed, which is required for +// batched queues in L0v2. +// +// Wait events might have been created in batched queues, which use regular +// command lists (batches). Since regular command lists are not executed +// immediately, but only after enqueueing on immediate lists, it is necessary to +// enqueue the regular command list associated with the given event. Otherwise, +// the event would never be signalled. The enqueueing is performed in +// onWaitListView(). +// +// In the case of batched queues, the function onWaitListView() is not called if +// the current queue created the given event. The operation associated with the +// given wait_list_view is added to the current batch of the queue. The entire +// batch is then enqueued for execution, i.e., as part of queueFinish or +// queueFlush. For the same queue, events from the given eventsWaitList are +// enqueued before the associated operation is executed. 
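//
// Illustrative sketch only (not code from this patch): the behaviour described
// above could be implemented roughly as follows. The constructor signature and
// the getQueue()/num names are assumptions; onWaitListUse() is the hook
// declared in event.hpp and getZeEvent() stands for the event's Level Zero
// handle accessor.
//
//   wait_list_view::wait_list_view(const ur_event_handle_t *phWaitEvents,
//                                  uint32_t numWaitEvents,
//                                  ur_queue_t_ *currentQueue) {
//     waitList.resize(numWaitEvents);
//     for (uint32_t i = 0; i < numWaitEvents; ++i) {
//       // If the event was created by another (possibly batched) queue, make
//       // sure the batch that signals it has been submitted for execution.
//       if (phWaitEvents[i]->getQueue() != currentQueue)
//         phWaitEvents[i]->onWaitListUse();
//       waitList[i] = phWaitEvents[i]->getZeEvent();
//     }
//     num = numWaitEvents;
//   }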
+ template void getZeHandlesBuffer(const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents, diff --git a/unified-runtime/source/adapters/level_zero/v2/event.hpp b/unified-runtime/source/adapters/level_zero/v2/event.hpp index 97f8052ec839f..d22d977bc578b 100644 --- a/unified-runtime/source/adapters/level_zero/v2/event.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/event.hpp @@ -70,8 +70,13 @@ struct ur_event_handle_t_ : ur_object { // Set the queue and command that this event is associated with void setQueue(ur_queue_t_ *hQueue); - void setBatch(ur_event_generation_t batch_generation); void setCommandType(ur_command_t commandType); + + // For batched queues + // Set the batch that this event is associated with + void setBatch(ur_event_generation_t batch_generation); + // Ensure that the batch associated with this event is submitted for + // execution, otherwise the event will never be signalled void onWaitListUse(); void reset(); diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp index 1632c7c4b50bf..1e7bdd32622de 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp @@ -898,20 +898,18 @@ ur_result_t ur_queue_batched_t::bindlessImagesSignalExternalSemaphoreExp( lockedBatch->getCurrentGeneration())); } -/* -In case of queues with batched submissions, which use regular command lists -(similarly to command buffers), the start timestamp would be recorded as the -operation is submitted (event.recordStartTimestamp() in -appendTimestampRecordingExp does not use the queue but directly the device), but -the end timestamp would wait for the submission of the given regular command -list. The difference between the start and end timestamps would reflect the -delay in the batch submission, the difference between end timestamps would -reflect the actual time of execution. - -TODO -The version of timestampRecording for batched queues should be adjusted in order -to reflect the idea behind the original function -*/ +// In case of queues with batched submissions, which use regular command lists +// (similarly to command buffers), the start timestamp would be recorded as the +// operation is submitted (event.recordStartTimestamp() in +// appendTimestampRecordingExp does not use the queue but directly the device), +// but the end timestamp would wait for the submission of the given regular +// command list. The difference between the start and end timestamps would +// reflect the delay in the batch submission, the difference between end +// timestamps would reflect the actual time of execution. 
+// +// TODO +// The version of timestampRecording for batched queues should be adjusted in +// order to reflect the idea behind the original function ur_result_t ur_queue_batched_t::enqueueTimestampRecordingExp( bool /* blocking */, uint32_t /* numEventsInWaitList */, @@ -919,23 +917,23 @@ ur_result_t ur_queue_batched_t::enqueueTimestampRecordingExp( ur_event_handle_t * /* phEvent */) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; - /* wait_list_view waitListView = - wait_list_view(phEventWaitList, numEventsInWaitList, this); + // wait_list_view waitListView = + // wait_list_view(phEventWaitList, numEventsInWaitList, this); - auto lockedBatch = currentCmdLists.lock(); + // auto lockedBatch = currentCmdLists.lock(); - lockedBatch->markIssuedCommand(); + // lockedBatch->markIssuedCommand(); - UR_CALL(lockedBatch->getActiveBatch().appendTimestampRecordingExp( - false, waitListView, - createEventIfRequestedRegular(phEvent, - lockedBatch->getCurrentGeneration()))); + // UR_CALL(lockedBatch->getActiveBatch().appendTimestampRecordingExp( + // false, waitListView, + // createEventIfRequestedRegular(phEvent, + // lockedBatch->getCurrentGeneration()))); - if (blocking) { - UR_CALL(queueFinishUnlocked(lockedBatch)); - } + // if (blocking) { + // UR_CALL(queueFinishUnlocked(lockedBatch)); + // } - return UR_RESULT_SUCCESS; */ + // return UR_RESULT_SUCCESS; } ur_result_t ur_queue_batched_t::enqueueCommandBufferExp( diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp index 4dd17127de9c2..a2964d28f567c 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp @@ -30,58 +30,53 @@ #include "ur_api.h" #include "ze_api.h" -/* Batched queues enable submission of operations to the driver in batches, - * therefore reducing the overhead of submitting every single operation - * individually. Similarly to command buffers in L0v2, they use regular command - * lists (later referenced as 'batches'). Operations enqueued on regular command - * lists are not executed immediately, but only after enqueueing the regular - * command list on an immediate command list. However, in contrast to command - * buffers, batched queues also handle submission of batches (regular command - * lists) instead of only collecting enqueued operations, by using an internal - * immediate command list. Command lists are managed by a batch_manager inside a - * batched queue. - * - * Batched queues can be enabled by setting UR_QUEUE_FLAG_SUBMISSION_BATCHED in - * ur_queue_flags_t or globally, through the environment variable - * UR_L0_FORCE_BATCHED=1. - */ +// Batched queues enable submission of operations to the driver in batches, +// therefore reducing the overhead of submitting every single operation +// individually. Similarly to command buffers in L0v2, they use regular command +// lists (later referenced as 'batches'). Operations enqueued on regular command +// lists are not executed immediately, but only after enqueueing the regular +// command list on an immediate command list. However, in contrast to command +// buffers, batched queues also handle submission of batches (regular command +// lists) instead of only collecting enqueued operations, by using an internal +// immediate command list. Command lists are managed by a batch_manager inside a +// batched queue. 
+// +// Batched queues can be enabled by setting UR_QUEUE_FLAG_SUBMISSION_BATCHED in +// ur_queue_flags_t or globally, through the environment variable +// UR_L0_FORCE_BATCHED=1. namespace v2 { struct batch_manager { private: - /* The currently active regular command list, which may be replaced in the - * command list manager, submitted for execution on the immediate command list - * and stored in the vector of submitted batches while awaiting execution - * completion - */ + // The currently active regular command list, which may be replaced in the + // command list manager, submitted for execution on the immediate command list + // and stored in the vector of submitted batches while awaiting execution + // completion ur_command_list_manager activeBatch; // An immediate command list for submission of batches ur_command_list_manager immediateList; - /* Submitted batches (regular command lists), stored for the completion of - * their execution. After queueFinish(), the vector is cleared - at this - * point, the destructor of command_list_handle adds the given command list to - * the command list cache, to the stack assigned to the description of the - * command list. When a new regular command list is requested after - * queueFinish(), it is popped from the available stack rather than retrieved - * through a driver call, which improves performance. - */ + // Submitted batches (regular command lists), stored for the completion of + // their execution. After queueFinish(), the vector is cleared - at this + // point, the destructor of command_list_handle adds the given command list to + // the command list cache, to the stack assigned to the description of the + // command list. When a new regular command list is requested after + // queueFinish(), it is popped from the available stack rather than retrieved + // through a driver call, which improves performance. std::vector runBatches; - /* The generation number of the current batch, assigned to events associated - * with operations enqueued on the given batch. It is incremented during every - * replacement of the current batch. When an event created by a batched queue - * appears in an eventWaitList, the batch assigned to the given event might - * not have been executed yet and the event might never be signalled. - * Comparing generation numbers enables determining whether the current batch - * should be submitted for execution. If the generation number of the current - * batch is higher than the number assigned to the given event, the batch - * associated with the event has already been submitted for execution and - * additional submission of the current batch is not needed. - */ + // The generation number of the current batch, assigned to events associated + // with operations enqueued on the given batch. It is incremented during every + // replacement of the current batch. When an event created by a batched queue + // appears in an eventWaitList, the batch assigned to the given event might + // not have been executed yet and the event might never be signalled. + // Comparing generation numbers enables determining whether the current batch + // should be submitted for execution. If the generation number of the current + // batch is higher than the number assigned to the given event, the batch + // associated with the event has already been submitted for execution and + // additional submission of the current batch is not needed. 
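//
// Illustrative sketch only (not code from this header): the comparison
// described above presumably boils down to something like the following in the
// batched queue's onEventWaitListUse() override, where submitActiveBatch() is
// a hypothetical helper that enqueues the active batch on the immediate list
// and bumps the generation number:
//
//   ur_result_t onEventWaitListUse(ur_event_generation_t eventGeneration) {
//     if (eventGeneration < regularGenerationNumber)
//       // The batch that signals this event was already submitted.
//       return UR_RESULT_SUCCESS;
//     // The event belongs to the still-open batch: submit it now so the
//     // event can eventually be signalled.
//     return submitActiveBatch();
//   }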
ur_event_generation_t regularGenerationNumber; - /* The limit of regular command lists stored for execution; if exceeded, the - * vector is cleared as part of queueFinish and slots are renewed. - */ + // The limit of regular command lists stored for execution; if exceeded, the + // vector is cleared as part of queueFinish and slots are renewed. static constexpr uint64_t initialSlotsForBatches = 10; // Whether any operation has been enqueued on the current batch bool isEmpty = true; @@ -148,12 +143,12 @@ struct ur_queue_batched_t : ur_object, ur_queue_t_ { ur_queue_flags_t flags; - /* Regular command lists use the regular pool cache type, whereas immediate - * command lists use the immediate pool cache type. Since user-requested - * operations are enqueued on regular command lists and immediate command - * lists are only used internally by the batched queue implementation, events - * are not created for immediate command lists. - */ + // Regular command lists use the regular pool cache type, whereas immediate + // command lists use the immediate pool cache type. Since user-requested + // operations are enqueued on regular command lists and immediate command + // lists are only used internally by the batched queue implementation, events + // are not created for immediate command lists. + v2::raii::cache_borrowed_event_pool eventPoolRegular; v2::raii::command_list_unique_handle getNewRegularCmdList() { diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_extensions.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_extensions.hpp index 110866b0f4e85..bd7fd23ab6721 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_extensions.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_extensions.hpp @@ -9,12 +9,12 @@ #include "ur_api.h" struct ur_queue_extensions { - /* Non-batched queues don't need to perform any action - - This function is intended to be called by the event. If the event has been - created by the given queue and is associated with the current batch, this - batch should be enqueued for execution. Otherwise, the event would never be - signalled */ + // Non-batched queues don't need to perform any action + // + // This function is intended to be called by the event. If the event has been + // created by the given queue and is associated with the current batch, this + // batch should be enqueued for execution. Otherwise, the event would never be + // signalled virtual ur_result_t onEventWaitListUse([[maybe_unused]] int64_t batch_generation) { return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/adapters/native_cpu/queue.cpp b/unified-runtime/source/adapters/native_cpu/queue.cpp index 0fe8311d83280..8ace8306088a2 100644 --- a/unified-runtime/source/adapters/native_cpu/queue.cpp +++ b/unified-runtime/source/adapters/native_cpu/queue.cpp @@ -33,11 +33,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, case UR_QUEUE_INFO_EMPTY: return ReturnValue(hQueue->isEmpty()); case UR_QUEUE_INFO_FLAGS: - /* - Support for UR_QUEUE_INFO_FLAGS in urQueueGetInfo is required by the - enqueueTimestampRecording tests after introducing batched queues, since - batched queues do not support enqueueTimestampRecording. - */ + // Support for UR_QUEUE_INFO_FLAGS in urQueueGetInfo is required by the + // enqueueTimestampRecording tests after introducing batched queues, since + // batched queues do not support enqueueTimestampRecording. 
if (!hQueue->isInOrder()) { flags |= UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; } diff --git a/unified-runtime/test/adapters/level_zero/v2/command_list_cache_test.cpp b/unified-runtime/test/adapters/level_zero/v2/command_list_cache_test.cpp index 488165e0761a8..8ff58c991bd01 100644 --- a/unified-runtime/test/adapters/level_zero/v2/command_list_cache_test.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/command_list_cache_test.cpp @@ -15,6 +15,7 @@ #include "uur/fixtures.h" #include "uur/raii.h" +#include "uur/utils.h" #include #include @@ -186,6 +187,7 @@ TEST_P(CommandListCacheTest, ImmediateCommandListsHaveProperAttributes) { TEST_P(CommandListCacheTest, CommandListsAreReusedByQueues) { static constexpr int NumQueuesPerType = 5; size_t NumUniqueQueueTypes = 0; + bool isBatched = false; for (int I = 0; I < NumQueuesPerType; I++) { NumUniqueQueueTypes = 0; @@ -216,6 +218,8 @@ TEST_P(CommandListCacheTest, CommandListsAreReusedByQueues) { ASSERT_EQ(urQueueCreate(context, device, &QueueProps, Queue.ptr()), UR_RESULT_SUCCESS); + ASSERT_NO_FATAL_FAILURE(uur::isQueueBatched(Queue, &isBatched)); + Queues.emplace_back(Queue); } } @@ -227,7 +231,13 @@ TEST_P(CommandListCacheTest, CommandListsAreReusedByQueues) { ASSERT_EQ(context->getCommandListCache().getNumImmediateCommandLists(), NumUniqueQueueTypes); - ASSERT_EQ(context->getCommandListCache().getNumRegularCommandLists(), 0); + + if (isBatched) { + ASSERT_EQ(context->getCommandListCache().getNumRegularCommandLists(), + NumUniqueQueueTypes); + } else { + ASSERT_EQ(context->getCommandListCache().getNumRegularCommandLists(), 0); + } } } diff --git a/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp b/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp index 2de31b830895a..a90959dc2eef1 100644 --- a/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp @@ -24,6 +24,7 @@ #include "event_provider_counter.hpp" #include "event_provider_normal.hpp" #include "queue_handle.hpp" +#include "uur/checks.h" #include "uur/fixtures.h" #include "ze_api.h" @@ -277,6 +278,7 @@ TEST_P(EventPoolTestWithQueue, WithTimestamp) { GTEST_SKIP() << "Profiling needs to be enabled"; } + SKIP_IF_BATCHED_QUEUE(queue); auto zeEvent = createZeEvent(context, device); ur_event_handle_t hEvent; diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueTimestampRecording.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueTimestampRecording.cpp index 3a5657c7217cf..e0936cc795cc1 100644 --- a/unified-runtime/test/conformance/enqueue/urEnqueueTimestampRecording.cpp +++ b/unified-runtime/test/conformance/enqueue/urEnqueueTimestampRecording.cpp @@ -4,6 +4,8 @@ // // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include "uur/checks.h" +#include #include #include @@ -11,15 +13,7 @@ struct urEnqueueTimestampRecordingExpTest : uur::urQueueTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::SetUp()); - ur_queue_flags_t queueFlags{}; - ASSERT_SUCCESS(urQueueGetInfo(queue, UR_QUEUE_INFO_FLAGS, - sizeof(ur_queue_flags_t), &queueFlags, - nullptr)); - - if (queueFlags & UR_QUEUE_FLAG_SUBMISSION_BATCHED) { - UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{}); - } - + SKIP_IF_BATCHED_QUEUE(queue); bool timestamp_recording_support = false; ASSERT_SUCCESS( uur::GetTimestampRecordingSupport(device, timestamp_recording_support)); diff --git a/unified-runtime/test/conformance/testing/include/uur/utils.h 
b/unified-runtime/test/conformance/testing/include/uur/utils.h index 179d8d583efe8..199ce2d3ed05d 100644 --- a/unified-runtime/test/conformance/testing/include/uur/utils.h +++ b/unified-runtime/test/conformance/testing/include/uur/utils.h @@ -489,6 +489,30 @@ getDriverVersion(ur_device_handle_t hDevice) { } \ } while (0) +#define SKIP_IF_BATCHED_QUEUE(queue) \ + do { \ + ur_queue_flags_t queueFlags{}; \ + ASSERT_EQ(urQueueGetInfo(queue, UR_QUEUE_INFO_FLAGS, \ + sizeof(ur_queue_flags_t), &queueFlags, nullptr), \ + UR_RESULT_SUCCESS); \ + \ + if (queueFlags & UR_QUEUE_FLAG_SUBMISSION_BATCHED) { \ + UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{}); \ + } \ + } while (0) + +inline void isQueueBatched(ur_queue_handle_t queue, bool *info) { + ur_queue_flags_t queueFlags{}; + ASSERT_EQ(urQueueGetInfo(queue, UR_QUEUE_INFO_FLAGS, sizeof(ur_queue_flags_t), + &queueFlags, nullptr), + UR_RESULT_SUCCESS); + if (queueFlags & UR_QUEUE_FLAG_SUBMISSION_BATCHED) { + *info = true; + } else { + *info = false; + } +} + // Is this a Data Center GPU Max series (aka PVC)? // TODO: change to use // https://spec.oneapi.io/level-zero/latest/core/api.html#ze-device-ip-version-ext-t
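//
// Illustrative usage sketch only (not part of this patch): a test or
// application can request a batched queue explicitly at creation time, which
// is what SKIP_IF_BATCHED_QUEUE and isQueueBatched above then detect through
// UR_QUEUE_INFO_FLAGS:
//
//   ur_queue_properties_t props{};
//   props.stype = UR_STRUCTURE_TYPE_QUEUE_PROPERTIES;
//   props.flags = UR_QUEUE_FLAG_SUBMISSION_BATCHED;
//   ur_queue_handle_t batchedQueue = nullptr;
//   ASSERT_SUCCESS(urQueueCreate(context, device, &props, &batchedQueue));
//
// Alternatively, batched submissions can be forced globally for the L0 v2
// adapter by setting the environment variable UR_L0_FORCE_BATCHED=1.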