diff --git a/unified-runtime/scripts/templates/queue_api.hpp.mako b/unified-runtime/scripts/templates/queue_api.hpp.mako index 25f53d1b79180..731fb7d1bc692 100644 --- a/unified-runtime/scripts/templates/queue_api.hpp.mako +++ b/unified-runtime/scripts/templates/queue_api.hpp.mako @@ -25,8 +25,9 @@ from templates import helper as th #pragma once #include +#include "queue_extensions.hpp" -struct ur_queue_t_ { +struct ur_queue_t_ : ur_queue_extensions { virtual ~ur_queue_t_(); %for obj in th.get_queue_related_functions(specs, n, tags): diff --git a/unified-runtime/source/adapters/level_zero/CMakeLists.txt b/unified-runtime/source/adapters/level_zero/CMakeLists.txt index 54f303a7823c9..d49868a927292 100644 --- a/unified-runtime/source/adapters/level_zero/CMakeLists.txt +++ b/unified-runtime/source/adapters/level_zero/CMakeLists.txt @@ -171,6 +171,7 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/v2/memory.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/lockable.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_batched.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_out_of_order.hpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.hpp @@ -187,6 +188,7 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/v2/kernel.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/memory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_batched.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_create.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_out_of_order.cpp diff --git a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp index b4c2674bd3364..80e2dedb82e23 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp @@ -12,6 +12,7 @@ #include 
"../command_buffer_command.hpp" #include "../helpers/kernel_helpers.hpp" #include "../ur_interface_loader.hpp" +#include "command_list_manager.hpp" #include "logger/ur_logger.hpp" #include "queue_handle.hpp" @@ -323,9 +324,12 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( auto eventsWaitList = commandBuffer->getWaitListFromSyncPoints( syncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendKernelLaunch( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, 0, - nullptr, numSyncPointsInWaitList, eventsWaitList, + nullptr, waitListView, commandBuffer->createEventIfRequested(retSyncPoint))); return UR_RESULT_SUCCESS; @@ -348,8 +352,11 @@ ur_result_t urCommandBufferAppendUSMMemcpyExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendUSMMemcpy( - false, pDst, pSrc, size, numSyncPointsInWaitList, eventsWaitList, + false, pDst, pSrc, size, waitListView, hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; @@ -375,9 +382,12 @@ ur_result_t urCommandBufferAppendMemBufferCopyExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendMemBufferCopy( - hSrcMem, hDstMem, srcOffset, dstOffset, size, numSyncPointsInWaitList, - eventsWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); + hSrcMem, hDstMem, srcOffset, dstOffset, size, waitListView, + hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; } catch (...) 
{ @@ -402,9 +412,12 @@ ur_result_t urCommandBufferAppendMemBufferWriteExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendMemBufferWrite( - hBuffer, false, offset, size, pSrc, numSyncPointsInWaitList, - eventsWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); + hBuffer, false, offset, size, pSrc, waitListView, + hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; } catch (...) { @@ -427,9 +440,12 @@ ur_result_t urCommandBufferAppendMemBufferReadExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendMemBufferRead( - hBuffer, false, offset, size, pDst, numSyncPointsInWaitList, - eventsWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); + hBuffer, false, offset, size, pDst, waitListView, + hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; } catch (...) { @@ -456,10 +472,13 @@ ur_result_t urCommandBufferAppendMemBufferCopyRectExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendMemBufferCopyRect( hSrcMem, hDstMem, srcOrigin, dstOrigin, region, srcRowPitch, - srcSlicePitch, dstRowPitch, dstSlicePitch, numSyncPointsInWaitList, - eventsWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); + srcSlicePitch, dstRowPitch, dstSlicePitch, waitListView, + hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; } catch (...) 
{ @@ -486,10 +505,12 @@ ur_result_t urCommandBufferAppendMemBufferWriteRectExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendMemBufferWriteRect( hBuffer, false, bufferOffset, hostOffset, region, bufferRowPitch, - bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, - numSyncPointsInWaitList, eventsWaitList, + bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, waitListView, hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; @@ -517,10 +538,12 @@ ur_result_t urCommandBufferAppendMemBufferReadRectExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendMemBufferReadRect( hBuffer, false, bufferOffset, hostOffset, region, bufferRowPitch, - bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, - numSyncPointsInWaitList, eventsWaitList, + bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, waitListView, hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; @@ -543,9 +566,12 @@ ur_result_t urCommandBufferAppendUSMFillExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendUSMFill( - pMemory, patternSize, pPattern, size, numSyncPointsInWaitList, - eventsWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); + pMemory, patternSize, pPattern, size, waitListView, + hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; } catch (...) 
{ return exceptionToResult(std::current_exception()); @@ -567,9 +593,12 @@ ur_result_t urCommandBufferAppendMemBufferFillExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendMemBufferFill( - hBuffer, pPattern, patternSize, offset, size, numSyncPointsInWaitList, - eventsWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); + hBuffer, pPattern, patternSize, offset, size, waitListView, + hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; } catch (...) { @@ -593,8 +622,11 @@ ur_result_t urCommandBufferAppendUSMPrefetchExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendUSMPrefetch( - pMemory, size, flags, numSyncPointsInWaitList, eventsWaitList, + pMemory, size, flags, waitListView, hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; @@ -617,8 +649,11 @@ ur_result_t urCommandBufferAppendUSMAdviseExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); + wait_list_view waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + UR_CALL(commandListLocked->appendUSMAdvise( - pMemory, size, advice, numSyncPointsInWaitList, eventsWaitList, + pMemory, size, advice, waitListView, hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; @@ -667,15 +702,19 @@ ur_result_t urCommandBufferAppendNativeCommandExp( auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( pSyncPointWaitList, numSyncPointsInWaitList); - UR_CALL(commandListLocked->appendEventsWaitWithBarrier( - numSyncPointsInWaitList, eventsWaitList, nullptr)); + wait_list_view 
waitListView = + wait_list_view(eventsWaitList, numSyncPointsInWaitList); + + UR_CALL( + commandListLocked->appendEventsWaitWithBarrier(waitListView, nullptr)); // Call user-defined function immediately pfnNativeCommand(pData); + wait_list_view emptyWaitList = wait_list_view(nullptr, 0); // Barrier on all commands after user defined commands. UR_CALL(commandListLocked->appendEventsWaitWithBarrier( - 0, nullptr, hCommandBuffer->createEventIfRequested(pSyncPoint))); + emptyWaitList, hCommandBuffer->createEventIfRequested(pSyncPoint))); return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp index 142caaecc1a71..0f08c7047426d 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp @@ -17,20 +17,118 @@ #include "kernel.hpp" #include "memory.hpp" +thread_local std::vector waitList; + +// The wait_list_view is a wrapper for eventsWaitLists, which: +// - enables passing a ze_event_handle_t buffer created from events as an +// argument for the driver API; +// - handles enqueueing operations associated with given events if these +// operations have not already been set for execution. +// +// Previously, it only stored the waitlist and the corresponding event count in +// a single container. Currently, the constructor also ensures that all +// associated operations will eventually be executed, which is required for +// batched queues in L0v2. +// +// Wait events might have been created in batched queues, which use regular +// command lists (batches). Since regular command lists are not executed +// immediately, but only after enqueueing on immediate lists, it is necessary to +// enqueue the regular command list associated with the given event. Otherwise, +// the event would never be signalled. The enqueueing is performed in +// onWaitListUse(). 
+// +// In the case of batched queues, the function onWaitListUse() is not called if +// the current queue created the given event. The operation associated with the +// given wait_list_view is added to the current batch of the queue. The entire +// batch is then enqueued for execution, i.e., as part of queueFinish or +// queueFlush. For the same queue, events from the given eventsWaitList are +// enqueued before the associated operation is executed. + +template +void getZeHandlesBuffer(const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents, + ur_queue_t_ *currentBatchedQueue) { + for (uint32_t i = 0; i < numWaitEvents; i++) { + // checking if the current queue has created the given event applies only + // to batched queues + if constexpr (HasBatchedQueue) { + if (currentBatchedQueue != phWaitEvents[i]->getQueue()) { + phWaitEvents[i]->onWaitListUse(); + } + } + waitList[i] = phWaitEvents[i]->getZeEvent(); + } +} + +void wait_list_view::init(uint32_t numWaitEvents) { + num = numWaitEvents; + max_size = num + 1; + + waitList.resize(max_size); +} + +void wait_list_view::setHandles(const ur_event_handle_t *phWaitEvents) { + // vector.data() is not guaranteed to return nullptr for an + // empty vector. + // Handling nullptr explicitly prevents passing an uninitialized buffer to + // the driver. + handles = phWaitEvents == nullptr ? 
nullptr : waitList.data(); +} + +wait_list_view::wait_list_view(const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents) { + init(numWaitEvents); + getZeHandlesBuffer(phWaitEvents, numWaitEvents, nullptr); + setHandles(phWaitEvents); +} + +wait_list_view::wait_list_view(const ur_event_handle_t *phWaitEvents, + uint32_t numWaitEvents, + ur_queue_t_ *currentBatchedQueue) { + + init(numWaitEvents); + getZeHandlesBuffer(phWaitEvents, numWaitEvents, currentBatchedQueue); + setHandles(phWaitEvents); +} + +// At most one additional event might be added after creating the given waitlist +void wait_list_view::addEvent(ur_event_handle_t Event) { + if (Event) { + if (handles) { + assert(num != max_size); + handles[num] = Event->getZeEvent(); + num++; + } else { + waitList.resize(0); + waitList.emplace_back(Event->getZeEvent()); + num++; + handles = waitList.data(); + } + } +} + ur_command_list_manager::ur_command_list_manager( ur_context_handle_t context, ur_device_handle_t device, v2::raii::command_list_unique_handle &&commandList) : hContext(context), hDevice(device), zeCommandList(std::move(commandList)) {} +v2::raii::command_list_unique_handle && +ur_command_list_manager::releaseCommandList() { + return std::move(zeCommandList); +} + +void ur_command_list_manager::replaceCommandList( + v2::raii::command_list_unique_handle &&cmdlist) { + zeCommandList = std::move(cmdlist); +} + ur_result_t ur_command_list_manager::appendGenericFillUnlocked( ur_mem_buffer_t *dst, size_t offset, size_t patternSize, - const void *pPattern, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, - ur_command_t commandType) { + const void *pPattern, size_t size, wait_list_view &waitListView, + ur_event_handle_t phEvent, ur_command_t commandType) { auto zeSignalEvent = getSignalEvent(phEvent, commandType); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto pDst = ur_cast(dst->getDevicePtr( 
hDevice.get(), ur_mem_buffer_t::device_access_mode_t::read_only, offset, @@ -63,11 +161,9 @@ ur_result_t ur_command_list_manager::appendGenericFillUnlocked( ur_result_t ur_command_list_manager::appendGenericCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, size_t srcOffset, - size_t dstOffset, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, - ur_command_t commandType) { + size_t dstOffset, size_t size, wait_list_view &waitListView, + ur_event_handle_t phEvent, ur_command_t commandType) { auto zeSignalEvent = getSignalEvent(phEvent, commandType); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto pSrc = ur_cast(src->getDevicePtr( hDevice.get(), ur_mem_buffer_t::device_access_mode_t::read_only, @@ -92,14 +188,12 @@ ur_result_t ur_command_list_manager::appendRegionCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, - size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, - ur_command_t commandType) { + size_t dstRowPitch, size_t dstSlicePitch, wait_list_view &waitListView, + ur_event_handle_t phEvent, ur_command_t commandType) { auto zeParams = ur2zeRegionParams(srcOrigin, dstOrigin, region, srcRowPitch, dstRowPitch, srcSlicePitch, dstSlicePitch); auto zeSignalEvent = getSignalEvent(phEvent, commandType); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto pSrc = ur_cast(src->getDevicePtr( hDevice.get(), ur_mem_buffer_t::device_access_mode_t::read_only, 0, @@ -121,22 +215,6 @@ ur_result_t ur_command_list_manager::appendRegionCopyUnlocked( return UR_RESULT_SUCCESS; } -wait_list_view ur_command_list_manager::getWaitListView( - const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents, - 
ur_event_handle_t additionalWaitEvent) { - - uint32_t totalNumWaitEvents = - numWaitEvents + (additionalWaitEvent != nullptr ? 1 : 0); - waitList.resize(totalNumWaitEvents); - for (uint32_t i = 0; i < numWaitEvents; i++) { - waitList[i] = phWaitEvents[i]->getZeEvent(); - } - if (additionalWaitEvent != nullptr) { - waitList[totalNumWaitEvents - 1] = additionalWaitEvent->getZeEvent(); - } - return {waitList.data(), static_cast(totalNumWaitEvents)}; -} - ze_event_handle_t ur_command_list_manager::getSignalEvent(ur_event_handle_t hUserEvent, ur_command_t commandType) { @@ -151,9 +229,8 @@ ur_command_list_manager::getSignalEvent(ur_event_handle_t hUserEvent, ur_result_t ur_command_list_manager::appendKernelLaunchUnlocked( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, - bool cooperative) { + const size_t *pLocalWorkSize, wait_list_view &waitListView, + ur_event_handle_t phEvent, bool cooperative) { UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); @@ -171,7 +248,6 @@ ur_result_t ur_command_list_manager::appendKernelLaunchUnlocked( pGlobalWorkSize, pLocalWorkSize)); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); UR_CALL(hKernel->prepareForSubmission( hContext.get(), hDevice.get(), pGlobalWorkOffset, workDim, WG[0], WG[1], @@ -203,8 +279,7 @@ ur_result_t ur_command_list_manager::appendKernelLaunch( const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_kernel_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view 
&waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendKernelLaunch"); for (uint32_t propIndex = 0; propIndex < numPropsInLaunchPropList; @@ -212,10 +287,9 @@ ur_result_t ur_command_list_manager::appendKernelLaunch( if (launchPropList[propIndex].id == UR_KERNEL_LAUNCH_PROPERTY_ID_COOPERATIVE && launchPropList[propIndex].value.cooperative) { - UR_CALL(appendKernelLaunchUnlocked(hKernel, workDim, pGlobalWorkOffset, - pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, - phEvent, true /* cooperative */)); + UR_CALL(appendKernelLaunchUnlocked( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + waitListView, phEvent, true /* cooperative */)); return UR_RESULT_SUCCESS; } if (launchPropList[propIndex].id != UR_KERNEL_LAUNCH_PROPERTY_ID_IGNORE && @@ -228,20 +302,18 @@ ur_result_t ur_command_list_manager::appendKernelLaunch( UR_CALL(appendKernelLaunchUnlocked( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, phEvent, false /* cooperative */)); + waitListView, phEvent, false /* cooperative */)); return UR_RESULT_SUCCESS; } ur_result_t ur_command_list_manager::appendUSMMemcpy( bool blocking, void *pDst, const void *pSrc, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMMemcpy"); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_MEMCPY); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); + auto [pWaitEvents, numWaitEvents, _] = waitListView; ZE2UR_CALL(zeCommandListAppendMemoryCopy, (zeCommandList.get(), pDst, pSrc, size, zeSignalEvent, @@ -256,8 +328,8 @@ ur_result_t ur_command_list_manager::appendUSMMemcpy( ur_result_t ur_command_list_manager::appendMemBufferFill( ur_mem_handle_t hMem, const void 
*pPattern, size_t patternSize, - size_t offset, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + size_t offset, size_t size, wait_list_view &waitListView, + ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferFill"); auto hBuffer = hMem->getBuffer(); @@ -266,26 +338,23 @@ ur_result_t ur_command_list_manager::appendMemBufferFill( std::scoped_lock lock(hBuffer->getMutex()); return appendGenericFillUnlocked(hBuffer, offset, patternSize, pPattern, size, - numEventsInWaitList, phEventWaitList, - phEvent, UR_COMMAND_MEM_BUFFER_FILL); + waitListView, phEvent, + UR_COMMAND_MEM_BUFFER_FILL); } ur_result_t ur_command_list_manager::appendUSMFill( void *pMem, size_t patternSize, const void *pPattern, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMFill"); ur_usm_handle_t dstHandle(hContext.get(), size, pMem); return appendGenericFillUnlocked(&dstHandle, 0, patternSize, pPattern, size, - numEventsInWaitList, phEventWaitList, - phEvent, UR_COMMAND_USM_FILL); + waitListView, phEvent, UR_COMMAND_USM_FILL); } ur_result_t ur_command_list_manager::appendUSMPrefetch( const void *pMem, size_t size, ur_usm_migration_flags_t flags, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMPrefetch"); switch (flags) { @@ -301,8 +370,7 @@ ur_result_t ur_command_list_manager::appendUSMPrefetch( } auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); + auto [pWaitEvents, numWaitEvents, _] = waitListView; if (pWaitEvents) { 
ZE2UR_CALL(zeCommandListAppendWaitOnEvents, @@ -323,15 +391,13 @@ ur_result_t ur_command_list_manager::appendUSMPrefetch( ur_result_t ur_command_list_manager::appendUSMAdvise( const void *pMem, size_t size, ur_usm_advice_flags_t advice, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMAdvise"); auto zeAdvice = ur_cast(advice); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); + auto [pWaitEvents, numWaitEvents, _] = waitListView; if (pWaitEvents) { ZE2UR_CALL(zeCommandListAppendWaitOnEvents, @@ -350,8 +416,7 @@ ur_result_t ur_command_list_manager::appendUSMAdvise( ur_result_t ur_command_list_manager::appendMemBufferRead( ur_mem_handle_t hMem, bool blockingRead, size_t offset, size_t size, - void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + void *pDst, wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferRead"); auto hBuffer = hMem->getBuffer(); @@ -362,14 +427,13 @@ ur_result_t ur_command_list_manager::appendMemBufferRead( std::scoped_lock lock(hBuffer->getMutex()); return appendGenericCopyUnlocked(hBuffer, &dstHandle, blockingRead, offset, 0, - size, numEventsInWaitList, phEventWaitList, - phEvent, UR_COMMAND_MEM_BUFFER_READ); + size, waitListView, phEvent, + UR_COMMAND_MEM_BUFFER_READ); } ur_result_t ur_command_list_manager::appendMemBufferWrite( ur_mem_handle_t hMem, bool blockingWrite, size_t offset, size_t size, - const void *pSrc, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + const void *pSrc, wait_list_view &waitListView, ur_event_handle_t phEvent) { 
TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferWrite"); auto hBuffer = hMem->getBuffer(); @@ -379,15 +443,15 @@ ur_result_t ur_command_list_manager::appendMemBufferWrite( std::scoped_lock lock(hBuffer->getMutex()); - return appendGenericCopyUnlocked( - &srcHandle, hBuffer, blockingWrite, 0, offset, size, numEventsInWaitList, - phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_WRITE); + return appendGenericCopyUnlocked(&srcHandle, hBuffer, blockingWrite, 0, + offset, size, waitListView, phEvent, + UR_COMMAND_MEM_BUFFER_WRITE); } ur_result_t ur_command_list_manager::appendMemBufferCopy( ur_mem_handle_t hSrc, ur_mem_handle_t hDst, size_t srcOffset, - size_t dstOffset, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + size_t dstOffset, size_t size, wait_list_view &waitListView, + ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferCopy"); auto hBufferSrc = hSrc->getBuffer(); @@ -402,8 +466,7 @@ ur_result_t ur_command_list_manager::appendMemBufferCopy( hBufferSrc->getMutex(), hBufferDst->getMutex()); return appendGenericCopyUnlocked(hBufferSrc, hBufferDst, false, srcOffset, - dstOffset, size, numEventsInWaitList, - phEventWaitList, phEvent, + dstOffset, size, waitListView, phEvent, UR_COMMAND_MEM_BUFFER_COPY); } @@ -411,8 +474,7 @@ ur_result_t ur_command_list_manager::appendMemBufferReadRect( ur_mem_handle_t hMem, bool blockingRead, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, - void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + void *pDst, wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferReadRect"); auto hBuffer = hMem->getBuffer(); @@ -423,16 +485,14 @@ ur_result_t 
ur_command_list_manager::appendMemBufferReadRect( return appendRegionCopyUnlocked( hBuffer, &dstHandle, blockingRead, bufferOrigin, hostOrigin, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, - numEventsInWaitList, phEventWaitList, phEvent, - UR_COMMAND_MEM_BUFFER_READ_RECT); + waitListView, phEvent, UR_COMMAND_MEM_BUFFER_READ_RECT); } ur_result_t ur_command_list_manager::appendMemBufferWriteRect( ur_mem_handle_t hMem, bool blockingWrite, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, - void *pSrc, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + void *pSrc, wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferWriteRect"); auto hBuffer = hMem->getBuffer(); @@ -443,16 +503,14 @@ ur_result_t ur_command_list_manager::appendMemBufferWriteRect( return appendRegionCopyUnlocked( &srcHandle, hBuffer, blockingWrite, hostOrigin, bufferOrigin, region, hostRowPitch, hostSlicePitch, bufferRowPitch, bufferSlicePitch, - numEventsInWaitList, phEventWaitList, phEvent, - UR_COMMAND_MEM_BUFFER_WRITE_RECT); + waitListView, phEvent, UR_COMMAND_MEM_BUFFER_WRITE_RECT); } ur_result_t ur_command_list_manager::appendMemBufferCopyRect( ur_mem_handle_t hSrc, ur_mem_handle_t hDst, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferCopyRect"); auto hBufferSrc = hSrc->getBuffer(); @@ -461,16 +519,16 @@ ur_result_t ur_command_list_manager::appendMemBufferCopyRect( std::scoped_lock lock( 
hBufferSrc->getMutex(), hBufferDst->getMutex()); - return appendRegionCopyUnlocked( - hBufferSrc, hBufferDst, false, srcOrigin, dstOrigin, region, srcRowPitch, - srcSlicePitch, dstRowPitch, dstSlicePitch, numEventsInWaitList, - phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_COPY_RECT); + return appendRegionCopyUnlocked(hBufferSrc, hBufferDst, false, srcOrigin, + dstOrigin, region, srcRowPitch, srcSlicePitch, + dstRowPitch, dstSlicePitch, waitListView, + phEvent, UR_COMMAND_MEM_BUFFER_COPY_RECT); } ur_result_t ur_command_list_manager::appendUSMMemcpy2D( bool blocking, void *pDst, size_t dstPitch, const void *pSrc, - size_t srcPitch, size_t width, size_t height, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + size_t srcPitch, size_t width, size_t height, wait_list_view &waitListView, + ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMMemcpy2D"); ur_rect_offset_t zeroOffset{0, 0, 0}; @@ -481,21 +539,19 @@ ur_result_t ur_command_list_manager::appendUSMMemcpy2D( return appendRegionCopyUnlocked(&srcHandle, &dstHandle, blocking, zeroOffset, zeroOffset, region, srcPitch, 0, dstPitch, 0, - numEventsInWaitList, phEventWaitList, phEvent, + waitListView, phEvent, UR_COMMAND_USM_MEMCPY_2D); } ur_result_t ur_command_list_manager::appendTimestampRecordingExp( - bool blocking, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + bool blocking, wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendTimestampRecordingExp"); if (!phEvent) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; } - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); + auto [pWaitEvents, numWaitEvents, _] = waitListView; phEvent->recordStartTimestamp(); @@ -515,14 +571,13 @@ ur_result_t ur_command_list_manager::appendTimestampRecordingExp( ur_result_t 
ur_command_list_manager::appendGenericCommandListsExp( uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, - ur_event_handle_t phEvent, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand, - ur_event_handle_t additionalWaitEvent) { + ur_event_handle_t phEvent, wait_list_view &waitListView, + ur_command_t callerCommand, ur_event_handle_t additionalWaitEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendGenericCommandListsExp"); auto zeSignalEvent = getSignalEvent(phEvent, callerCommand); - auto [pWaitEvents, numWaitEvents] = getWaitListView( - phEventWaitList, numEventsInWaitList, additionalWaitEvent); + waitListView.addEvent(additionalWaitEvent); + auto [pWaitEvents, numWaitEvents, _] = waitListView; ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, (getZeCommandList(), numCommandLists, phCommandLists, @@ -532,8 +587,8 @@ ur_result_t ur_command_list_manager::appendGenericCommandListsExp( } ur_result_t ur_command_list_manager::appendCommandBufferExp( - ur_exp_command_buffer_handle_t hCommandBuffer, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + ur_exp_command_buffer_handle_t hCommandBuffer, wait_list_view &waitListView, + ur_event_handle_t phEvent) { auto bufferCommandListLocked = hCommandBuffer->commandListManager.lock(); ze_command_list_handle_t commandBufferCommandList = @@ -549,9 +604,10 @@ ur_result_t ur_command_list_manager::appendCommandBufferExp( (executionEvent->getZeEvent(), UINT64_MAX)); } - UR_CALL(appendGenericCommandListsExp( - 1, &commandBufferCommandList, phEvent, numEventsInWaitList, - phEventWaitList, UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, executionEvent)); + UR_CALL(appendGenericCommandListsExp(1, &commandBufferCommandList, phEvent, + waitListView, + UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, + /* already synchronized */ nullptr)); UR_CALL(hCommandBuffer->registerExecutionEventUnlocked(phEvent)); return 
UR_RESULT_SUCCESS; @@ -560,14 +616,12 @@ ur_result_t ur_command_list_manager::appendCommandBufferExp( ur_result_t ur_command_list_manager::appendMemImageRead( ur_mem_handle_t hMem, bool blockingRead, ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pDst, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemImageRead"); auto hImage = hMem->getImage(); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_IMAGE_READ); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto [zeImage, zeRegion] = hImage->getRWRegion(origin, region, rowPitch, slicePitch); @@ -586,14 +640,12 @@ ur_result_t ur_command_list_manager::appendMemImageRead( ur_result_t ur_command_list_manager::appendMemImageWrite( ur_mem_handle_t hMem, bool blockingWrite, ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pSrc, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemImageWrite"); auto hImage = hMem->getImage(); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_IMAGE_WRITE); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto [zeImage, zeRegion] = hImage->getRWRegion(origin, region, rowPitch, slicePitch); @@ -612,15 +664,13 @@ ur_result_t ur_command_list_manager::appendMemImageWrite( ur_result_t ur_command_list_manager::appendMemImageCopy( ur_mem_handle_t hSrc, ur_mem_handle_t hDst, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t 
phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemImageWrite"); auto hImageSrc = hSrc->getImage(); auto hImageDst = hDst->getImage(); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_IMAGE_COPY); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto desc = ur_mem_image_t::getCopyRegions(*hImageSrc, *hImageDst, srcOrigin, dstOrigin, region); @@ -638,9 +688,8 @@ ur_result_t ur_command_list_manager::appendMemImageCopy( ur_result_t ur_command_list_manager::appendMemBufferMap( ur_mem_handle_t hMem, bool blockingMap, ur_map_flags_t mapFlags, - size_t offset, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, - void **ppRetMap) { + size_t offset, size_t size, wait_list_view &waitListView, + ur_event_handle_t phEvent, void **ppRetMap) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferMap"); auto hBuffer = hMem->getBuffer(); @@ -648,7 +697,6 @@ ur_result_t ur_command_list_manager::appendMemBufferMap( std::scoped_lock lock(hBuffer->getMutex()); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_BUFFER_MAP); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto pDst = ur_cast(hBuffer->mapHostPtr( mapFlags, offset, size, zeCommandList.get(), waitListView)); @@ -672,15 +720,15 @@ ur_result_t ur_command_list_manager::appendMemBufferMap( return UR_RESULT_SUCCESS; } -ur_result_t ur_command_list_manager::appendMemUnmap( - ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { +ur_result_t +ur_command_list_manager::appendMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, + wait_list_view &waitListView, + ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemUnmap"); auto hBuffer = hMem->getBuffer(); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_UNMAP); - auto waitListView = 
getWaitListView(phEventWaitList, numEventsInWaitList); // TODO: currently unmapHostPtr deallocates memory immediately, // since the memory might be used by the user, we need to make sure @@ -700,9 +748,7 @@ ur_result_t ur_command_list_manager::appendMemUnmap( ur_result_t ur_command_list_manager::appendUSMFill2D( void * /*pMem*/, size_t /*pitch*/, size_t /*patternSize*/, const void * /*pPattern*/, size_t /*width*/, size_t /*height*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t /*phEvent*/) { + wait_list_view & /* waitListView */, ur_event_handle_t /*phEvent*/) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -726,8 +772,8 @@ static void *getGlobalPointerFromModule(ze_module_handle_t hModule, ur_result_t ur_command_list_manager::appendDeviceGlobalVariableWrite( ur_program_handle_t hProgram, const char *name, bool blockingWrite, - size_t count, size_t offset, const void *pSrc, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + size_t count, size_t offset, const void *pSrc, wait_list_view &waitListView, + ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY( "ur_command_list_manager::appendDeviceGlobalVariableWrite"); @@ -740,14 +786,13 @@ ur_result_t ur_command_list_manager::appendDeviceGlobalVariableWrite( // Locking is done inside appendUSMMemcpy return appendUSMMemcpy(blockingWrite, ur_cast(globalVarPtr) + offset, - pSrc, count, numEventsInWaitList, phEventWaitList, - phEvent); + pSrc, count, waitListView, phEvent); } ur_result_t ur_command_list_manager::appendDeviceGlobalVariableRead( ur_program_handle_t hProgram, const char *name, bool blockingRead, - size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + size_t count, size_t offset, void *pDst, wait_list_view &waitListView, + ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY( 
"ur_command_list_manager::appendDeviceGlobalVariableRead"); @@ -761,32 +806,27 @@ ur_result_t ur_command_list_manager::appendDeviceGlobalVariableRead( // Locking is done inside appendUSMMemcpy return appendUSMMemcpy(blockingRead, pDst, ur_cast(globalVarPtr) + offset, count, - numEventsInWaitList, phEventWaitList, phEvent); + waitListView, phEvent); } ur_result_t ur_command_list_manager::appendReadHostPipe( ur_program_handle_t /*hProgram*/, const char * /*pipe_symbol*/, bool /*blocking*/, void * /*pDst*/, size_t /*size*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t /*phEvent*/) { + wait_list_view & /* waitListView */, ur_event_handle_t /*phEvent*/) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } ur_result_t ur_command_list_manager::appendWriteHostPipe( ur_program_handle_t /*hProgram*/, const char * /*pipe_symbol*/, bool /*blocking*/, void * /*pSrc*/, size_t /*size*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t /*phEvent*/) { + wait_list_view & /* waitListView */, ur_event_handle_t /*phEvent*/) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } ur_result_t ur_command_list_manager::appendUSMAllocHelper( ur_queue_t_ *Queue, ur_usm_pool_handle_t pPool, const size_t size, - const ur_exp_async_usm_alloc_properties_t *, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, void **ppMem, - ur_event_handle_t phEvent, ur_usm_type_t type) { + const ur_exp_async_usm_alloc_properties_t *, wait_list_view &waitListView, + void **ppMem, ur_event_handle_t phEvent, ur_usm_type_t type) { if (!pPool) { pPool = hContext->getAsyncPool(); } @@ -806,8 +846,7 @@ ur_result_t ur_command_list_manager::appendUSMAllocHelper( std::tie(*ppMem, originAllocEvent) = *asyncAlloc; } - auto waitListView = - getWaitListView(phEventWaitList, numEventsInWaitList, originAllocEvent); + waitListView.addEvent(originAllocEvent); ur_command_t commandType = 
UR_COMMAND_FORCE_UINT32; switch (type) { @@ -825,7 +864,7 @@ ur_result_t ur_command_list_manager::appendUSMAllocHelper( } auto zeSignalEvent = getSignalEvent(phEvent, commandType); - auto [pWaitEvents, numWaitEvents] = waitListView; + auto [pWaitEvents, numWaitEvents, _] = waitListView; if (numWaitEvents > 0) { ZE2UR_CALL(zeCommandListAppendWaitOnEvents, @@ -844,14 +883,12 @@ ur_result_t ur_command_list_manager::appendUSMAllocHelper( ur_result_t ur_command_list_manager::appendUSMFreeExp( ur_queue_t_ *Queue, ur_usm_pool_handle_t, void *pMem, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMFreeExp"); assert(phEvent); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_ENQUEUE_USM_FREE_EXP); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); + auto [pWaitEvents, numWaitEvents, _] = waitListView; umf_memory_pool_handle_t hPool = nullptr; auto umfRet = umfPoolByPtr(pMem, &hPool); @@ -896,11 +933,9 @@ ur_result_t ur_command_list_manager::bindlessImagesImageCopyExp( ur_exp_image_copy_region_t *pCopyRegion, ur_exp_image_copy_flags_t imageCopyFlags, ur_exp_image_copy_input_types_t imageCopyInputTypes, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitListView, ur_event_handle_t phEvent) { auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_MEM_IMAGE_COPY); - auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); return bindlessImagesHandleCopyFlags( pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, @@ -911,8 +946,8 @@ ur_result_t ur_command_list_manager::bindlessImagesImageCopyExp( ur_result_t ur_command_list_manager::bindlessImagesWaitExternalSemaphoreExp( ur_exp_external_semaphore_handle_t hSemaphore, bool hasWaitValue, - uint64_t waitValue, 
uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + uint64_t waitValue, wait_list_view &waitListView, + ur_event_handle_t phEvent) { auto hPlatform = hContext->getPlatform(); if (hPlatform->ZeExternalSemaphoreExt.Supported == false) { UR_LOG_LEGACY(ERR, @@ -923,8 +958,7 @@ ur_result_t ur_command_list_manager::bindlessImagesWaitExternalSemaphoreExp( auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_EXTERNAL_SEMAPHORE_WAIT_EXP); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); + auto [pWaitEvents, numWaitEvents, _] = waitListView; ze_external_semaphore_wait_params_ext_t waitParams = { ZE_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_WAIT_PARAMS_EXT, nullptr, 0}; @@ -941,8 +975,8 @@ ur_result_t ur_command_list_manager::bindlessImagesWaitExternalSemaphoreExp( ur_result_t ur_command_list_manager::bindlessImagesSignalExternalSemaphoreExp( ur_exp_external_semaphore_handle_t hSemaphore, bool hasSignalValue, - uint64_t signalValue, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent) { + uint64_t signalValue, wait_list_view &waitListView, + ur_event_handle_t phEvent) { auto hPlatform = hContext->getPlatform(); if (hPlatform->ZeExternalSemaphoreExt.Supported == false) { UR_LOG_LEGACY(ERR, @@ -953,8 +987,7 @@ ur_result_t ur_command_list_manager::bindlessImagesSignalExternalSemaphoreExp( auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_EXTERNAL_SEMAPHORE_SIGNAL_EXP); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); + auto [pWaitEvents, numWaitEvents, _] = waitListView; ze_external_semaphore_signal_params_ext_t signalParams = { ZE_STRUCTURE_TYPE_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_EXT, nullptr, 0}; @@ -973,7 +1006,7 @@ ur_result_t ur_command_list_manager::bindlessImagesSignalExternalSemaphoreExp( ur_result_t ur_command_list_manager::appendNativeCommandExp( 
ur_exp_enqueue_native_command_function_t, void *, uint32_t, const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, - uint32_t, const ur_event_handle_t *, ur_event_handle_t) { + wait_list_view &, ur_event_handle_t) { UR_LOG_LEGACY( ERR, logger::LegacyMessage("[UR][L0_v2] {} function not implemented!"), "{} function not implemented!", __FUNCTION__); @@ -991,14 +1024,13 @@ ze_command_list_handle_t ur_command_list_manager::getZeCommandList() { return zeCommandList.get(); } -ur_result_t ur_command_list_manager::appendEventsWait( - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { +ur_result_t +ur_command_list_manager::appendEventsWait(wait_list_view &waitListView, + ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendEventsWait"); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); + auto [pWaitEvents, numWaitEvents, _] = waitListView; if (numWaitEvents > 0) { ZE2UR_CALL(zeCommandListAppendWaitOnEvents, @@ -1014,14 +1046,12 @@ ur_result_t ur_command_list_manager::appendEventsWait( } ur_result_t ur_command_list_manager::appendEventsWaitWithBarrier( - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent) { + wait_list_view &waitList, ur_event_handle_t phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendEventsWaitWithBarrier"); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); + auto [pWaitEvents, numWaitEvents, _] = waitList; ZE2UR_CALL(zeCommandListAppendBarrier, (zeCommandList.get(), zeSignalEvent, numWaitEvents, pWaitEvents)); diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp 
b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp index 3c1bbd710ed47..9461fd3ccbeb2 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp @@ -14,6 +14,7 @@ #include "context.hpp" #include "event_pool_cache.hpp" #include "queue_api.hpp" +#include "ur_api.h" #include struct ur_mem_buffer_t; @@ -21,9 +22,17 @@ struct ur_mem_buffer_t; struct wait_list_view { ze_event_handle_t *handles; uint32_t num; + uint32_t max_size; - wait_list_view(ze_event_handle_t *handles, uint32_t num) - : handles(num > 0 ? handles : nullptr), num(num) {} + wait_list_view(const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents); + wait_list_view(const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents, + ur_queue_t_ *currentBatchedQueue); + + void init(uint32_t numWaitEvents); + + void setHandles(const ur_event_handle_t *phWaitEvents); + + void addEvent(ur_event_handle_t Event); operator bool() const { assert((handles != nullptr) == (num > 0)); @@ -54,127 +63,110 @@ struct ur_command_list_manager { ur_result_t releaseSubmittedKernels(); /************ Generic queue methods *************/ - ur_result_t appendEventsWait(uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + ur_result_t appendEventsWait(wait_list_view &waitListView, ur_event_handle_t phEvent); - ur_result_t - appendEventsWaitWithBarrier(uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent); + ur_result_t appendEventsWaitWithBarrier(wait_list_view &waitList, + ur_event_handle_t phEvent); ur_result_t appendMemBufferRead(ur_mem_handle_t hBuffer, bool blockingRead, size_t offset, size_t size, void *pDst, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendMemBufferWrite(ur_mem_handle_t hBuffer, bool blockingWrite, size_t 
offset, size_t size, const void *pSrc, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendMemBufferReadRect( ur_mem_handle_t hBuffer, bool blockingRead, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, - size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); + size_t hostSlicePitch, void *pDst, wait_list_view &waitListView, + ur_event_handle_t phEvent); ur_result_t appendMemBufferWriteRect( ur_mem_handle_t hBuffer, bool blockingWrite, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent); + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendMemBufferCopy(ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, size_t srcOffset, size_t dstOffset, size_t size, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendMemBufferCopyRect( ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, - size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); + size_t dstRowPitch, size_t dstSlicePitch, wait_list_view &waitListView, + ur_event_handle_t phEvent); ur_result_t appendMemBufferFill(ur_mem_handle_t hBuffer, const void *pPattern, size_t patternSize, size_t offset, - size_t size, uint32_t numEventsInWaitList, - const 
ur_event_handle_t *phEventWaitList, + size_t size, wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendMemImageRead(ur_mem_handle_t hImage, bool blockingRead, ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pDst, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendMemImageWrite(ur_mem_handle_t hImage, bool blockingWrite, ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pSrc, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendMemImageCopy(ur_mem_handle_t hImageSrc, ur_mem_handle_t hImageDst, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, - ur_rect_region_t region, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + ur_rect_region_t region, wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendMemBufferMap(ur_mem_handle_t hBuffer, bool blockingMap, ur_map_flags_t mapFlags, size_t offset, - size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + size_t size, wait_list_view &waitListView, ur_event_handle_t phEvent, void **ppRetMap); ur_result_t appendMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendUSMFill(void *pMem, size_t patternSize, const void *pPattern, size_t size, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendUSMMemcpy(bool blocking, void *pDst, const void *pSrc, - size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + size_t size, wait_list_view &waitListView, 
ur_event_handle_t phEvent); ur_result_t appendUSMFill2D(void *, size_t, size_t, const void *, size_t, - size_t, uint32_t, const ur_event_handle_t *, - ur_event_handle_t); + size_t, wait_list_view &, ur_event_handle_t); ur_result_t appendUSMMemcpy2D(bool, void *, size_t, const void *, size_t, - size_t, size_t, uint32_t, - const ur_event_handle_t *, ur_event_handle_t); + size_t, size_t, wait_list_view &, + ur_event_handle_t); ur_result_t appendUSMPrefetch(const void *pMem, size_t size, ur_usm_migration_flags_t flags, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendUSMAdvise(const void *pMem, size_t size, ur_usm_advice_flags_t advice, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); - ur_result_t appendDeviceGlobalVariableWrite( - ur_program_handle_t hProgram, const char *name, bool blockingWrite, - size_t count, size_t offset, const void *pSrc, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent); - ur_result_t appendDeviceGlobalVariableRead( - ur_program_handle_t hProgram, const char *name, bool blockingRead, - size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); + ur_result_t appendDeviceGlobalVariableWrite(ur_program_handle_t hProgram, + const char *name, + bool blockingWrite, size_t count, + size_t offset, const void *pSrc, + wait_list_view &waitListView, + ur_event_handle_t phEvent); + ur_result_t appendDeviceGlobalVariableRead(ur_program_handle_t hProgram, + const char *name, + bool blockingRead, size_t count, + size_t offset, void *pDst, + wait_list_view &waitListView, + ur_event_handle_t phEvent); ur_result_t appendReadHostPipe(ur_program_handle_t hProgram, const char *pipe_symbol, bool blocking, void *pDst, size_t size, - uint32_t 
numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendWriteHostPipe(ur_program_handle_t hProgram, const char *pipe_symbol, bool blocking, void *pSrc, size_t size, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t bindlessImagesImageCopyExp( const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, @@ -184,96 +176,90 @@ struct ur_command_list_manager { ur_exp_image_copy_region_t *pCopyRegion, ur_exp_image_copy_flags_t imageCopyFlags, ur_exp_image_copy_input_types_t imageCopyInputTypes, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent); + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t bindlessImagesWaitExternalSemaphoreExp( ur_exp_external_semaphore_handle_t hSemaphore, bool hasWaitValue, - uint64_t waitValue, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); + uint64_t waitValue, wait_list_view &waitListView, + ur_event_handle_t phEvent); ur_result_t bindlessImagesSignalExternalSemaphoreExp( ur_exp_external_semaphore_handle_t hSemaphore, bool hasSignalValue, - uint64_t signalValue, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); + uint64_t signalValue, wait_list_view &waitListView, + ur_event_handle_t phEvent); ur_result_t appendCooperativeKernelLaunchExp( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent); - ur_result_t - appendTimestampRecordingExp(bool blocking, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent); + ur_result_t 
appendTimestampRecordingExp(bool blocking, + wait_list_view &waitListView, + ur_event_handle_t phEvent); ur_result_t appendCommandBufferExp(ur_exp_command_buffer_handle_t hCommandBuffer, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendKernelLaunch( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_kernel_launch_property_t *launchPropList, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent); + wait_list_view &waitListView, ur_event_handle_t phEvent); ur_result_t appendNativeCommandExp(ur_exp_enqueue_native_command_function_t, void *, uint32_t, const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, - uint32_t, const ur_event_handle_t *, - ur_event_handle_t); + wait_list_view &, ur_event_handle_t); ur_result_t appendUSMAllocHelper( ur_queue_t_ *Queue, ur_usm_pool_handle_t pPool, const size_t size, - const ur_exp_async_usm_alloc_properties_t *, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, void **ppMem, - ur_event_handle_t phEvent, ur_usm_type_t type); + const ur_exp_async_usm_alloc_properties_t *, wait_list_view &waitListView, + void **ppMem, ur_event_handle_t phEvent, ur_usm_type_t type); ur_result_t appendUSMFreeExp(ur_queue_t_ *Queue, ur_usm_pool_handle_t, - void *pMem, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, + void *pMem, wait_list_view &waitListView, ur_event_handle_t phEvent); + v2::raii::command_list_unique_handle &&releaseCommandList(); + + void replaceCommandList(v2::raii::command_list_unique_handle &&cmdlist); + private: ur_result_t appendGenericCommandListsExp( uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, - ur_event_handle_t phEvent, uint32_t numEventsInWaitList, - 
const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand, - ur_event_handle_t additionalWaitEvent); + ur_event_handle_t phEvent, wait_list_view &waitListView, + ur_command_t callerCommand, ur_event_handle_t additionalWaitEvent); void recordSubmittedKernel(ur_kernel_handle_t hKernel); - wait_list_view - getWaitListView(const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents, - ur_event_handle_t additionalWaitEvent = nullptr); ze_event_handle_t getSignalEvent(ur_event_handle_t hUserEvent, ur_command_t commandType); ur_result_t appendKernelLaunchUnlocked( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, - bool cooperative); + const size_t *pLocalWorkSize, wait_list_view &waitListView, + ur_event_handle_t phEvent, bool cooperative); - ur_result_t appendGenericFillUnlocked( - ur_mem_buffer_t *hBuffer, size_t offset, size_t patternSize, - const void *pPattern, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, - ur_command_t commandType); + ur_result_t appendGenericFillUnlocked(ur_mem_buffer_t *hBuffer, size_t offset, + size_t patternSize, + const void *pPattern, size_t size, + wait_list_view &waitListView, + ur_event_handle_t phEvent, + ur_command_t commandType); - ur_result_t appendGenericCopyUnlocked( - ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, - size_t srcOffset, size_t dstOffset, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t phEvent, ur_command_t commandType); + ur_result_t appendGenericCopyUnlocked(ur_mem_buffer_t *src, + ur_mem_buffer_t *dst, bool blocking, + size_t srcOffset, size_t dstOffset, + size_t size, + wait_list_view &waitListView, + ur_event_handle_t phEvent, + ur_command_t commandType); ur_result_t 
appendRegionCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, - size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t phEvent, - ur_command_t commandType); + size_t dstRowPitch, size_t dstSlicePitch, wait_list_view &waitListView, + ur_event_handle_t phEvent, ur_command_t commandType); // Context needs to be a first member - it needs to be alive // until all other members are destroyed. diff --git a/unified-runtime/source/adapters/level_zero/v2/event.cpp b/unified-runtime/source/adapters/level_zero/v2/event.cpp index 2c3c4b9a8685c..5f3f693db0811 100644 --- a/unified-runtime/source/adapters/level_zero/v2/event.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/event.cpp @@ -8,6 +8,7 @@ // //===----------------------------------------------------------------------===// +#include #include #include "context.hpp" @@ -123,10 +124,22 @@ void ur_event_handle_t_::setQueue(ur_queue_t_ *hQueue) { profilingData.reset(); } +void ur_event_handle_t_::setBatch(ur_event_generation_t batch_generation) { + this->batchGeneration = batch_generation; +} + void ur_event_handle_t_::setCommandType(ur_command_t commandType) { this->commandType = commandType; } +// Enqueue batch execution if the event is created by the batched queue as part +// of its current batch +void ur_event_handle_t_::onWaitListUse() { + if (batchGeneration) { + hQueue->onEventWaitListUse(batchGeneration.value()); + } +} + void ur_event_handle_t_::recordStartTimestamp() { // queue and device must be set before calling this assert(hQueue); @@ -149,6 +162,8 @@ void ur_event_handle_t_::reset() { if (!(flags & v2::EVENT_FLAGS_COUNTER)) { zeEventHostReset(getZeEvent()); } + + batchGeneration = std::nullopt; } ze_event_handle_t ur_event_handle_t_::getZeEvent() const { @@ -192,6 +207,10 @@ 
ur_event_handle_t_::getEventEndTimestampAndHandle() { ur_queue_t_ *ur_event_handle_t_::getQueue() const { return hQueue; } +std::optional ur_event_handle_t_::getBatch() const { + return batchGeneration; +} + ur_context_handle_t ur_event_handle_t_::getContext() const { return hContext; } ur_command_t ur_event_handle_t_::getCommandType() const { return commandType; } @@ -234,6 +253,7 @@ ur_result_t urEventRelease(ur_event_handle_t hEvent) try { ur_result_t urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) try { for (uint32_t i = 0; i < numEvents; ++i) { + phEventWaitList[i]->onWaitListUse(); ZE2UR_CALL(zeEventHostSynchronize, (phEventWaitList[i]->getZeEvent(), UINT64_MAX)); } diff --git a/unified-runtime/source/adapters/level_zero/v2/event.hpp b/unified-runtime/source/adapters/level_zero/v2/event.hpp index 9a31c47358947..d22d977bc578b 100644 --- a/unified-runtime/source/adapters/level_zero/v2/event.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/event.hpp @@ -20,6 +20,8 @@ #include "common/ur_ref_count.hpp" #include "event_provider.hpp" +using ur_event_generation_t = int64_t; + namespace v2 { class event_pool; } @@ -70,6 +72,13 @@ struct ur_event_handle_t_ : ur_object { void setQueue(ur_queue_t_ *hQueue); void setCommandType(ur_command_t commandType); + // For batched queues + // Set the batch that this event is associated with + void setBatch(ur_event_generation_t batch_generation); + // Ensure that the batch associated with this event is submitted for + // execution, otherwise the event will never be signalled + void onWaitListUse(); + void reset(); ze_event_handle_t getZeEvent() const; @@ -98,6 +107,8 @@ struct ur_event_handle_t_ : ur_object { // Get the type of the command that this event is associated with ur_command_t getCommandType() const; + std::optional getBatch() const; + // Get the device associated with this event ur_device_handle_t getDevice() const; @@ -129,6 +140,8 @@ struct ur_event_handle_t_ : ur_object { // queue and 
commandType that this event is associated with, set by enqueue // commands ur_queue_t_ *hQueue = nullptr; + // std::optional holds a value for events created by batched queues + std::optional batchGeneration; ur_command_t commandType = UR_COMMAND_FORCE_UINT32; ur_device_handle_t hDevice = nullptr; diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp index 4bd9d8fd2141e..5d730697e834c 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.hpp @@ -16,9 +16,10 @@ #pragma once +#include "queue_extensions.hpp" #include -struct ur_queue_t_ { +struct ur_queue_t_ : ur_queue_extensions { virtual ~ur_queue_t_(); virtual ur_result_t queueGetInfo(ur_queue_info_t, size_t, void *, diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp new file mode 100644 index 0000000000000..1e7bdd32622de --- /dev/null +++ b/unified-runtime/source/adapters/level_zero/v2/queue_batched.cpp @@ -0,0 +1,1040 @@ +//===--------------- queue_batched.cpp - Level Zero Adapter ---------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "queue_batched.hpp" +#include "adapters/level_zero/common.hpp" +#include "command_buffer.hpp" +#include "command_list_cache.hpp" +#include "command_list_manager.hpp" +#include "event.hpp" +#include "event_pool.hpp" +#include "kernel.hpp" +#include "lockable.hpp" +#include "memory.hpp" +#include "ur.hpp" + +#include "../common/latency_tracker.hpp" +#include "../helpers/kernel_helpers.hpp" +#include "../image_common.hpp" + +#include "../program.hpp" +#include "../ur_interface_loader.hpp" +#include "ur_api.h" +#include "ze_api.h" +#include +#include +#include + +namespace v2 { + +ur_queue_batched_t::ur_queue_batched_t( + ur_context_handle_t hContext, ur_device_handle_t hDevice, uint32_t ordinal, + ze_command_queue_priority_t priority, std::optional index, + [[maybe_unused]] event_flags_t eventFlags, ur_queue_flags_t flags) + : regularCmdListDesc(v2::command_list_desc_t{ + true /* isInOrder*/, ordinal /* Ordinal*/, + true /* copyOffloadEnable*/, false /*isMutable*/}), + currentCmdLists( + hContext, hDevice, + /* regular command list*/ + hContext->getCommandListCache().getRegularCommandList( + hDevice->ZeDevice, regularCmdListDesc), + /* command list immediate*/ + hContext->getCommandListCache().getImmediateCommandList( + hDevice->ZeDevice, + {true, ordinal, true /* always enable copy offload */}, + ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, priority, index) + + ) { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::constructor"); + + // TODO common code? 
+ if (!hContext->getPlatform()->ZeCommandListImmediateAppendExt.Supported) { + UR_LOG(ERR, "Adapter v2 is used but the current driver does not support " + "the zeCommandListImmediateAppendCommandListsExp entrypoint."); + throw UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + this->hContext = hContext; + this->hDevice = hDevice; + + this->flags = flags; + + eventPoolRegular = hContext->getEventPoolCache(PoolCacheType::Regular) + .borrow(hDevice->Id.value(), v2::EVENT_FLAGS_COUNTER); +} + +ur_event_handle_t ur_queue_batched_t::createEventIfRequestedRegular( + ur_event_handle_t *phEvent, ur_event_generation_t batch_generation) { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::createEventIfRequested"); + + if (phEvent == nullptr) { + return nullptr; + } + + (*phEvent) = eventPoolRegular->allocate(); + (*phEvent)->setQueue(this); + (*phEvent)->setBatch(batch_generation); + + return (*phEvent); +} + +ur_event_handle_t ur_queue_batched_t::createEventAndRetainRegular( + ur_event_handle_t *phEvent, ur_event_generation_t batch_generation) { + auto hEvent = eventPoolRegular->allocate(); + hEvent->setQueue(this); + hEvent->setBatch(batch_generation); + + if (phEvent) { + (*phEvent) = hEvent; + hEvent->retain(); + } + + return hEvent; +} + +ur_result_t batch_manager::renewRegularUnlocked( + v2::raii::command_list_unique_handle &&newRegularBatch) { + TRACK_SCOPE_LATENCY("batch_manager::renewRegularUnlocked"); + + regularGenerationNumber++; + + // save the previous regular command list for execution + runBatches.push_back(activeBatch.releaseCommandList()); + // renew the regular command list (current batch) + activeBatch.replaceCommandList( + std::forward<v2::raii::command_list_unique_handle>(newRegularBatch)); + + setBatchEmpty(); + + return UR_RESULT_SUCCESS; +} + +ur_result_t +ur_queue_batched_t::renewBatchUnlocked(locked<batch_manager> &batchLocked) { + if (batchLocked->isLimitOfUsedCommandListsReached()) { + UR_CALL(queueFinishUnlocked(batchLocked)); + } + + return batchLocked->renewRegularUnlocked(getNewRegularCmdList()); +} +
+ur_result_t batch_manager::enqueueCurrentBatchUnlocked() { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::enqueueCurrentBatchUnlocked"); + + ze_command_list_handle_t regularList = activeBatch.getZeCommandList(); + { + TRACK_SCOPE_LATENCY( + "ur_queue_batched_t::enqueueCurrentBatchUnlocked_finalize"); + // finalize + ZE2UR_CALL(zeCommandListClose, (regularList)); + } + { + TRACK_SCOPE_LATENCY( + "ur_queue_batched_t::enqueueCurrentBatchUnlocked_runBatchAppend"); + // run batch + ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, + (immediateList.getZeCommandList(), 1, &regularList, nullptr, 0, + nullptr)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t +ur_queue_batched_t::onEventWaitListUse(ur_event_generation_t batch_generation) { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::onEventWaitListUse"); + + auto batchLocked = currentCmdLists.lock(); + if (batchLocked->isCurrentGeneration(batch_generation)) { + return queueFlushUnlocked(batchLocked); + } else { + return UR_RESULT_SUCCESS; + } +} + +ur_result_t ur_queue_batched_t::enqueueKernelLaunch( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const ur_kernel_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + TRACK_SCOPE_LATENCY("ur_queue_batched_t::enqueueKernelLaunch"); + auto currentRegular = currentCmdLists.lock(); + + currentRegular->markIssuedCommand(); + + UR_CALL(currentRegular->getActiveBatch().appendKernelLaunch( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, waitListView, + createEventIfRequestedRegular(phEvent, + currentRegular->getCurrentGeneration()))); + + return UR_RESULT_SUCCESS; +} + +ur_result_t
batch_manager::hostSynchronize() { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::hostSynchronize"); + + ZE2UR_CALL(zeCommandListHostSynchronize, + (immediateList.getZeCommandList(), UINT64_MAX)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::queueFinishPoolsUnlocked() { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::asyncPools"); + + hContext->getAsyncPool()->cleanupPoolsForQueue(this); + hContext->forEachUsmPool([this](ur_usm_pool_handle_t hPool) { + hPool->cleanupPoolsForQueue(this); + return true; + }); + + return UR_RESULT_SUCCESS; +} + +ur_result_t batch_manager::batchFinish() { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::batchFinish"); + + { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::releaseSubmittedKernels"); + UR_CALL(immediateList.releaseSubmittedKernels()); + } + + { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::resetRegCmdlist"); + ZE2UR_CALL(zeCommandListReset, (activeBatch.getZeCommandList())); + } + + runBatches.clear(); + + return UR_RESULT_SUCCESS; +} + +ur_result_t +ur_queue_batched_t::queueFinishUnlocked(locked<batch_manager> &batchLocked) { + UR_CALL(batchLocked->enqueueCurrentBatchUnlocked()); + UR_CALL(batchLocked->hostSynchronize()); + + UR_CALL(queueFinishPoolsUnlocked()); + + return batchLocked->batchFinish(); +} + +ur_result_t ur_queue_batched_t::queueFinish() { + try { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::queueFinish"); + // finish current batch + auto lockedBatches = currentCmdLists.lock(); + return queueFinishUnlocked(lockedBatches); + + } catch (...) { + return exceptionToResult(std::current_exception()); + } +} + +ur_queue_batched_t::~ur_queue_batched_t() { + try { + UR_CALL_THROWS(queueFinish()); + } catch (...)
{ + // Ignore errors during destruction + } +} + +ur_result_t ur_queue_batched_t::enqueueMemBufferRead( + ur_mem_handle_t hBuffer, bool blockingRead, size_t offset, size_t size, + void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + try { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::enqueueMemBufferRead"); + + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatches = currentCmdLists.lock(); + + lockedBatches->markIssuedCommand(); + + UR_CALL(lockedBatches->getActiveBatch().appendMemBufferRead( + hBuffer, false, offset, size, pDst, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatches->getCurrentGeneration()))); + + if (blockingRead) { + UR_CALL(queueFinishUnlocked(lockedBatches)); + } + + return UR_RESULT_SUCCESS; + } catch (...) { + return exceptionToResult(std::current_exception()); + } +} + +ur_result_t ur_queue_batched_t::enqueueMemBufferWrite( + ur_mem_handle_t hBuffer, bool blockingWrite, size_t offset, size_t size, + const void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) try { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::enqueueMemBufferWrite"); + + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatches = currentCmdLists.lock(); + + lockedBatches->markIssuedCommand(); + + UR_CALL(lockedBatches->getActiveBatch().appendMemBufferWrite( + hBuffer, false, offset, size, pSrc, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatches->getCurrentGeneration()))); + + if (blockingWrite) { + UR_CALL(queueFinishUnlocked(lockedBatches)); + } + + return UR_RESULT_SUCCESS; +} catch (...) 
{ + return exceptionToResult(std::current_exception()); +} + +ur_result_t ur_queue_batched_t::enqueueDeviceGlobalVariableWrite( + ur_program_handle_t hProgram, const char *name, bool blockingWrite, + size_t count, size_t offset, const void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendDeviceGlobalVariableWrite( + hProgram, name, false, count, offset, pSrc, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blockingWrite) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueDeviceGlobalVariableRead( + ur_program_handle_t hProgram, const char *name, bool blockingRead, + size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendDeviceGlobalVariableRead( + hProgram, name, false, count, offset, pDst, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blockingRead) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueMemBufferFill( + ur_mem_handle_t hBuffer, const void *pPattern, size_t patternSize, + size_t offset, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) try { + TRACK_SCOPE_LATENCY("ur_queue_batched_t::enqueueMemBufferFill"); + wait_list_view waitListView 
= + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendMemBufferFill( + hBuffer, pPattern, patternSize, offset, size, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); + +} catch (...) { + return exceptionToResult(std::current_exception()); +} + +ur_result_t ur_queue_batched_t::enqueueUSMMemcpy( + bool blocking, void *pDst, const void *pSrc, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendUSMMemcpy( + false, pDst, pSrc, size, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blocking) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueUSMFreeExp( + ur_usm_pool_handle_t pPool, void *pMem, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendUSMFreeExp( + this, pPool, pMem, waitListView, + createEventAndRetainRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + return queueFlushUnlocked(lockedBatch); +} + +ur_result_t ur_queue_batched_t::enqueueMemBufferMap( + ur_mem_handle_t hBuffer, bool blockingMap, ur_map_flags_t mapFlags, + size_t offset, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, + void **ppRetMap) { + + 
wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendMemBufferMap( + hBuffer, false, mapFlags, offset, size, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()), + ppRetMap)); + + if (blockingMap) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueMemUnmap( + ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendMemUnmap( + hMem, pMappedPtr, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +ur_result_t ur_queue_batched_t::enqueueMemBufferReadRect( + ur_mem_handle_t hBuffer, bool blockingRead, ur_rect_offset_t bufferOrigin, + ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, + size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, + void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendMemBufferReadRect( + hBuffer, false, bufferOrigin, hostOrigin, region, bufferRowPitch, + bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blockingRead) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + + return 
UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueMemBufferWriteRect( + ur_mem_handle_t hBuffer, bool blockingWrite, ur_rect_offset_t bufferOrigin, + ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, + size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, + void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendMemBufferWriteRect( + hBuffer, false, bufferOrigin, hostOrigin, region, bufferRowPitch, + bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blockingWrite) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueUSMAdvise(const void *pMem, size_t size, + ur_usm_advice_flags_t advice, + ur_event_handle_t *phEvent) { + wait_list_view emptyWaitList = wait_list_view(nullptr, 0, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendUSMAdvise( + pMem, size, advice, emptyWaitList, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +ur_result_t ur_queue_batched_t::enqueueUSMMemcpy2D( + bool blocking, void *pDst, size_t dstPitch, const void *pSrc, + size_t srcPitch, size_t width, size_t height, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendUSMMemcpy2D( + 
false, pDst, dstPitch, pSrc, srcPitch, width, height, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blocking) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueUSMFill2D( + void *pMem, size_t pitch, size_t patternSize, const void *pPattern, + size_t width, size_t height, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendUSMFill2D( + pMem, pitch, patternSize, pPattern, width, height, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +ur_result_t ur_queue_batched_t::enqueueUSMPrefetch( + const void *pMem, size_t size, ur_usm_migration_flags_t flags, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendUSMPrefetch( + pMem, size, flags, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +ur_result_t ur_queue_batched_t::enqueueMemBufferCopyRect( + ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, + ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, + ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, + size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = 
currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendMemBufferCopyRect( + hBufferSrc, hBufferDst, srcOrigin, dstOrigin, region, srcRowPitch, + srcSlicePitch, dstRowPitch, dstSlicePitch, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +ur_result_t ur_queue_batched_t::enqueueEventsWaitWithBarrier( + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + if ((flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0) { + UR_CALL(lockedBatch->getActiveBatch().appendEventsWaitWithBarrier( + waitListView, createEventIfRequestedRegular( + phEvent, lockedBatch->getCurrentGeneration()))); + } else { + UR_CALL(lockedBatch->getActiveBatch().appendEventsWait( + waitListView, createEventIfRequestedRegular( + phEvent, lockedBatch->getCurrentGeneration()))); + } + + return queueFlushUnlocked(lockedBatch); +} + +ur_result_t +ur_queue_batched_t::enqueueEventsWait(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendEventsWait( + waitListView, createEventIfRequestedRegular( + phEvent, lockedBatch->getCurrentGeneration()))); + + return queueFlushUnlocked(lockedBatch); +} + +ur_result_t ur_queue_batched_t::enqueueMemBufferCopy( + ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, size_t srcOffset, + size_t dstOffset, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + 
wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendMemBufferCopy( + hBufferSrc, hBufferDst, srcOffset, dstOffset, size, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +ur_result_t ur_queue_batched_t::enqueueUSMFill( + void *pMem, size_t patternSize, const void *pPattern, size_t size, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendUSMFill( + pMem, patternSize, pPattern, size, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +ur_result_t ur_queue_batched_t::enqueueMemImageRead( + ur_mem_handle_t hImage, bool blockingRead, ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pDst, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendMemImageRead( + hImage, false, origin, region, rowPitch, slicePitch, pDst, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blockingRead) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueMemImageWrite( + ur_mem_handle_t hImage, bool blockingWrite, ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pSrc, + uint32_t 
numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendMemImageWrite( + hImage, false, origin, region, rowPitch, slicePitch, pSrc, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blockingWrite) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueMemImageCopy( + ur_mem_handle_t hImageSrc, ur_mem_handle_t hImageDst, + ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, + ur_rect_region_t region, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendMemImageCopy( + hImageSrc, hImageDst, srcOrigin, dstOrigin, region, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +ur_result_t ur_queue_batched_t::enqueueReadHostPipe( + ur_program_handle_t hProgram, const char *pipe_symbol, bool blocking, + void *pDst, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendReadHostPipe( + hProgram, pipe_symbol, false, pDst, size, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blocking) { + UR_CALL(queueFinishUnlocked(lockedBatch)); 
+ } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueWriteHostPipe( + ur_program_handle_t hProgram, const char *pipe_symbol, bool blocking, + void *pSrc, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + UR_CALL(lockedBatch->getActiveBatch().appendWriteHostPipe( + hProgram, pipe_symbol, false, pSrc, size, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()))); + + if (blocking) { + UR_CALL(queueFinishUnlocked(lockedBatch)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_batched_t::enqueueUSMDeviceAllocExp( + ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendUSMAllocHelper( + this, pPool, size, pProperties, waitListView, ppMem, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()), + UR_USM_TYPE_DEVICE); +} + +ur_result_t ur_queue_batched_t::enqueueUSMSharedAllocExp( + ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent) { + + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return 
lockedBatch->getActiveBatch().appendUSMAllocHelper( + this, pPool, size, pProperties, waitListView, ppMem, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()), + UR_USM_TYPE_SHARED); +} + +ur_result_t ur_queue_batched_t::enqueueUSMHostAllocExp( + ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().appendUSMAllocHelper( + this, pPool, size, pProperties, waitListView, ppMem, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration()), + UR_USM_TYPE_HOST); +} + +ur_result_t ur_queue_batched_t::bindlessImagesImageCopyExp( + const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, + const ur_image_desc_t *pDstImageDesc, + const ur_image_format_t *pSrcImageFormat, + const ur_image_format_t *pDstImageFormat, + ur_exp_image_copy_region_t *pCopyRegion, + ur_exp_image_copy_flags_t imageCopyFlags, + ur_exp_image_copy_input_types_t imageCopyInputTypes, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().bindlessImagesImageCopyExp( + pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, + pDstImageFormat, pCopyRegion, imageCopyFlags, imageCopyInputTypes, + waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +ur_result_t ur_queue_batched_t::bindlessImagesWaitExternalSemaphoreExp( + 
ur_exp_external_semaphore_handle_t hSemaphore, bool hasWaitValue, + uint64_t waitValue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().bindlessImagesWaitExternalSemaphoreExp( + hSemaphore, hasWaitValue, waitValue, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +ur_result_t ur_queue_batched_t::bindlessImagesSignalExternalSemaphoreExp( + ur_exp_external_semaphore_handle_t hSemaphore, bool hasSignalValue, + uint64_t signalValue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList, this); + + auto lockedBatch = currentCmdLists.lock(); + + lockedBatch->markIssuedCommand(); + + return lockedBatch->getActiveBatch().bindlessImagesSignalExternalSemaphoreExp( + hSemaphore, hasSignalValue, signalValue, waitListView, + createEventIfRequestedRegular(phEvent, + lockedBatch->getCurrentGeneration())); +} + +// In case of queues with batched submissions, which use regular command lists +// (similarly to command buffers), the start timestamp would be recorded as the +// operation is submitted (event.recordStartTimestamp() in +// appendTimestampRecordingExp does not use the queue but directly the device), +// but the end timestamp would wait for the submission of the given regular +// command list. The difference between the start and end timestamps would +// reflect the delay in the batch submission, the difference between end +// timestamps would reflect the actual time of execution. 
+//
+// TODO
+// The version of timestampRecording for batched queues should be adjusted in
+// order to reflect the idea behind the original function
+
+ur_result_t ur_queue_batched_t::enqueueTimestampRecordingExp(
+    bool /* blocking */, uint32_t /* numEventsInWaitList */,
+    const ur_event_handle_t * /* phEventWaitList */,
+    ur_event_handle_t * /* phEvent */) {
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  // Intended implementation, disabled until the TODO above is resolved:
+  // wait_list_view waitListView =
+  //     wait_list_view(phEventWaitList, numEventsInWaitList, this);
+  // auto lockedBatch = currentCmdLists.lock();
+  // lockedBatch->markIssuedCommand();
+  // UR_CALL(lockedBatch->getActiveBatch().appendTimestampRecordingExp(
+  //     false, waitListView,
+  //     createEventIfRequestedRegular(phEvent,
+  //         lockedBatch->getCurrentGeneration())));
+  // if (blocking) {
+  //   UR_CALL(queueFinishUnlocked(lockedBatch));
+  // }
+  // return UR_RESULT_SUCCESS;
+}
+
+// Appends the execution of hCommandBuffer to the active batch.
+ur_result_t ur_queue_batched_t::enqueueCommandBufferExp(
+    ur_exp_command_buffer_handle_t hCommandBuffer, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  wait_list_view waitListView =
+      wait_list_view(phEventWaitList, numEventsInWaitList, this);
+
+  auto lockedBatch = currentCmdLists.lock();
+
+  lockedBatch->markIssuedCommand();
+
+  return lockedBatch->getActiveBatch().appendCommandBufferExp(
+      hCommandBuffer, waitListView,
+      createEventAndRetainRegular(phEvent,
+                                  lockedBatch->getCurrentGeneration()));
+}
+
+// Appends a native command (pfnNativeEnqueue) to the active batch.
+ur_result_t ur_queue_batched_t::enqueueNativeCommandExp(
+    ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data,
+    uint32_t numMemsInMemList, const ur_mem_handle_t *phMemList,
+    const ur_exp_enqueue_native_command_properties_t *pProperties,
+    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+    ur_event_handle_t *phEvent) {
+  wait_list_view waitListView =
+      wait_list_view(phEventWaitList, numEventsInWaitList, this);
+
+  auto lockedBatch = currentCmdLists.lock();
+
+  lockedBatch->markIssuedCommand();
+
+  return lockedBatch->getActiveBatch().appendNativeCommandExp(
+      pfnNativeEnqueue, data, numMemsInMemList, phMemList, pProperties,
+      waitListView,
+      createEventIfRequestedRegular(phEvent,
+                                    lockedBatch->getCurrentGeneration()));
+}
+
+// Implements urQueueGetInfo for batched queues.
+ur_result_t ur_queue_batched_t::queueGetInfo(ur_queue_info_t propName,
+                                             size_t propSize, void *pPropValue,
+                                             size_t *pPropSizeRet) {
+  UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
+  // TODO: consider support for queue properties and size
+  switch ((uint32_t)propName) { // cast to avoid warnings on EXT enum values
+  case UR_QUEUE_INFO_CONTEXT:
+    return ReturnValue(hContext);
+  case UR_QUEUE_INFO_DEVICE:
+    return ReturnValue(hDevice);
+  case UR_QUEUE_INFO_REFERENCE_COUNT:
+    return ReturnValue(uint32_t{RefCount.getCount()});
+  case UR_QUEUE_INFO_FLAGS:
+    return ReturnValue(flags);
+  case UR_QUEUE_INFO_SIZE:
+  case UR_QUEUE_INFO_DEVICE_DEFAULT:
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+  case UR_QUEUE_INFO_EMPTY: {
+    // Operations pending in the active batch make the queue non-empty;
+    // otherwise query the immediate list with a zero timeout.
+    if (!currentCmdLists.get_no_lock()->isActiveBatchEmpty()) {
+      return ReturnValue(false);
+    }
+    auto status = ZE_CALL_NOCHECK(
+        zeCommandListHostSynchronize,
+        (currentCmdLists.get_no_lock()->getImmediateListHandle(), 0));
+    if (status == ZE_RESULT_SUCCESS) {
+      return ReturnValue(true);
+    }
+    if (status == ZE_RESULT_NOT_READY) {
+      return ReturnValue(false);
+    }
+    return ze2urResult(status);
+  }
+  default:
+    UR_LOG(ERR,
+           "Unsupported ParamName in urQueueGetInfo: "
+           "ParamName={}(0x{})",
+           propName, logger::toHex(propName));
+    return UR_RESULT_ERROR_INVALID_VALUE;
+  }
+}
+
+// Exposes the internal immediate command list as the native queue handle.
+ur_result_t
+ur_queue_batched_t::queueGetNativeHandle(ur_queue_native_desc_t * /*pDesc*/,
+                                         ur_native_handle_t *phNativeQueue) {
+  *phNativeQueue = reinterpret_cast<ur_native_handle_t>(
+      currentCmdLists.get_no_lock()->getImmediateListHandle());
+  return UR_RESULT_SUCCESS;
+}
+
+// Enqueues the current batch and, if that succeeds, renews it. The caller
+// passes in the lock it already holds on the command lists.
+ur_result_t
+ur_queue_batched_t::queueFlushUnlocked(locked<batch_manager> &batchLocked) {
+  UR_CALL(batchLocked->enqueueCurrentBatchUnlocked());
+
+  return renewBatchUnlocked(batchLocked);
+}
+
+// Locks the command lists and flushes the current batch.
+ur_result_t ur_queue_batched_t::queueFlush() {
+  auto batchLocked = currentCmdLists.lock();
+  return queueFlushUnlocked(batchLocked);
+}
+
+} // namespace v2
diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp
new file mode 100644
index 0000000000000..a2964d28f567c
--- /dev/null
+++ b/unified-runtime/source/adapters/level_zero/v2/queue_batched.hpp
@@ -0,0 +1,434 @@
+//===--------------- queue_batched.hpp - Level Zero Adapter ---------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "../common.hpp"
+#include "../device.hpp"
+
+#include "command_list_cache.hpp"
+#include "common/ur_ref_count.hpp"
+#include "context.hpp"
+#include "event.hpp"
+#include "event_pool_cache.hpp"
+#include "memory.hpp"
+#include "queue_api.hpp"
+
+#include "ur/ur.hpp"
+
+#include "command_buffer.hpp"
+#include "command_list_manager.hpp"
+#include "lockable.hpp"
+#include "queue_immediate_in_order.hpp"
+#include "ur_api.h"
+#include "ze_api.h"
+
+// Batched queues enable submission of operations to the driver in batches,
+// therefore reducing the overhead of submitting every single operation
+// individually. Similarly to command buffers in L0v2, they use regular command
+// lists (later referenced as 'batches'). Operations enqueued on regular command
+// lists are not executed immediately, but only after enqueueing the regular
+// command list on an immediate command list.
However, in contrast to command
+// buffers, batched queues also handle submission of batches (regular command
+// lists) instead of only collecting enqueued operations, by using an internal
+// immediate command list. Command lists are managed by a batch_manager inside a
+// batched queue.
+//
+// Batched queues can be enabled by setting UR_QUEUE_FLAG_SUBMISSION_BATCHED in
+// ur_queue_flags_t or globally, through the environment variable
+// UR_L0_FORCE_BATCHED=1.
+
+namespace v2 {
+
+// Command-list state of a batched queue: the active batch (a regular command
+// list), the immediate command list that submits batches, and the batches
+// that have already been submitted.
+struct batch_manager {
+private:
+  // The currently active regular command list, which may be replaced in the
+  // command list manager, submitted for execution on the immediate command list
+  // and stored in the vector of submitted batches while awaiting execution
+  // completion
+  ur_command_list_manager activeBatch;
+  // An immediate command list for submission of batches
+  ur_command_list_manager immediateList;
+  // Submitted batches (regular command lists), stored for the completion of
+  // their execution. After queueFinish(), the vector is cleared - at this
+  // point, the destructor of command_list_handle adds the given command list to
+  // the command list cache, to the stack assigned to the description of the
+  // command list. When a new regular command list is requested after
+  // queueFinish(), it is popped from the available stack rather than retrieved
+  // through a driver call, which improves performance.
+  std::vector<v2::raii::command_list_unique_handle> runBatches;
+  // The generation number of the current batch, assigned to events associated
+  // with operations enqueued on the given batch. It is incremented during every
+  // replacement of the current batch. When an event created by a batched queue
+  // appears in an eventWaitList, the batch assigned to the given event might
+  // not have been executed yet and the event might never be signalled.
+  // Comparing generation numbers enables determining whether the current batch
+  // should be submitted for execution. If the generation number of the current
+  // batch is higher than the number assigned to the given event, the batch
+  // associated with the event has already been submitted for execution and
+  // additional submission of the current batch is not needed.
+  ur_event_generation_t regularGenerationNumber;
+  // The limit of regular command lists stored for execution; once reached, the
+  // vector is cleared as part of queueFinish and slots are renewed.
+  static constexpr uint64_t initialSlotsForBatches = 10;
+  // True while no operation has been enqueued on the current batch
+  bool isEmpty = true;
+
+public:
+  // Moves both command lists into their managers; generation starts at 0.
+  batch_manager(ur_context_handle_t context, ur_device_handle_t device,
+                v2::raii::command_list_unique_handle &&commandListRegular,
+                v2::raii::command_list_unique_handle &&commandListImmediate)
+      : activeBatch(context, device, std::move(commandListRegular)),
+        immediateList(context, device, std::move(commandListImmediate)),
+        regularGenerationNumber(0) {
+    runBatches.reserve(initialSlotsForBatches);
+  }
+
+  ur_result_t hostSynchronize();
+
+  ur_result_t
+  renewRegularUnlocked(v2::raii::command_list_unique_handle &&newRegularBatch);
+
+  bool isCurrentGeneration(ur_event_generation_t batch_generation) const {
+    return batch_generation == regularGenerationNumber;
+  }
+
+  ur_result_t enqueueCurrentBatchUnlocked();
+
+  ur_command_list_manager &getActiveBatch() { return activeBatch; }
+
+  ur_event_generation_t getCurrentGeneration() const {
+    return regularGenerationNumber;
+  }
+
+  ur_result_t batchFinish();
+
+  ze_command_list_handle_t getImmediateListHandle() {
+    return immediateList.getZeCommandList();
+  }
+
+  ze_command_list_handle_t getRegularListHandle() {
+    return activeBatch.getZeCommandList();
+  }
+
+  bool isActiveBatchEmpty() const { return isEmpty; }
+
+  void markIssuedCommand() { isEmpty = false; }
+
+  void setBatchEmpty() { isEmpty = true; }
+
+  bool isLimitOfUsedCommandListsReached() const {
+    return initialSlotsForBatches <= runBatches.size();
+  }
+};
+
+struct
ur_queue_batched_t : ur_object, ur_queue_t_ {
+private:
+  ur_context_handle_t hContext;
+  ur_device_handle_t hDevice;
+
+  v2::command_list_desc_t regularCmdListDesc;
+  lockable<batch_manager> currentCmdLists; // active batch + immediate list
+
+  ur_queue_flags_t flags; // reported by UR_QUEUE_INFO_FLAGS
+
+  // Regular command lists use the regular pool cache type, whereas immediate
+  // command lists use the immediate pool cache type. Since user-requested
+  // operations are enqueued on regular command lists and immediate command
+  // lists are only used internally by the batched queue implementation, events
+  // are not created for immediate command lists.
+
+  v2::raii::cache_borrowed_event_pool eventPoolRegular;
+
+  v2::raii::command_list_unique_handle getNewRegularCmdList() {
+    TRACK_SCOPE_LATENCY("ur_queue_batched_t::getNewRegularCmdList");
+
+    return hContext->getCommandListCache().getRegularCommandList(
+        hDevice->ZeDevice, regularCmdListDesc);
+  }
+
+  ur_result_t renewBatchUnlocked(locked<batch_manager> &batchLocked);
+
+  ur_event_handle_t
+  createEventIfRequestedRegular(ur_event_handle_t *phEvent,
+                                ur_event_generation_t generation_number);
+
+  ur_event_handle_t
+  createEventAndRetainRegular(ur_event_handle_t *phEvent,
+                              ur_event_generation_t batch_generation);
+
+  ur_result_t queueFinishPoolsUnlocked();
+
+  ur_result_t queueFinishUnlocked(locked<batch_manager> &batchLocked);
+
+  ur_result_t queueFlushUnlocked(locked<batch_manager> &batchLocked);
+
+public:
+  ur_queue_batched_t(ur_context_handle_t, ur_device_handle_t, uint32_t ordinal,
+                     ze_command_queue_priority_t priority,
+                     std::optional<int32_t> index, event_flags_t eventFlags,
+                     ur_queue_flags_t flags);
+
+  ur_result_t
+  onEventWaitListUse(ur_event_generation_t batch_generation) override;
+
+  ~ur_queue_batched_t();
+
+  ur_result_t queueGetInfo(ur_queue_info_t propName, size_t propSize,
+                           void *pPropValue, size_t *pPropSizeRet) override;
+  ur_result_t queueGetNativeHandle(ur_queue_native_desc_t *pDesc,
+                                   ur_native_handle_t *phNativeQueue) override;
+  ur_result_t queueFinish() override;
+  ur_result_t queueFlush() override;
+  ur_result_t enqueueKernelLaunch(
+      ur_kernel_handle_t hKernel, uint32_t workDim,
+      const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+      const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList,
+      const ur_kernel_launch_property_t *launchPropList,
+      uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+      ur_event_handle_t *phEvent) override;
+
+  ur_result_t
+  enqueueEventsWaitWithBarrier(uint32_t numEventsInWaitList,
+                               const ur_event_handle_t *phEventWaitList,
+                               ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueEventsWait(uint32_t numEventsInWaitList,
+                                const ur_event_handle_t *phEventWaitList,
+                                ur_event_handle_t *phEvent) override;
+
+  ur_result_t
+  enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *,
+                                  uint32_t numEventsInWaitList,
+                                  const ur_event_handle_t *phEventWaitList,
+                                  ur_event_handle_t *phEvent) override {
+    return enqueueEventsWaitWithBarrier(numEventsInWaitList, phEventWaitList,
+                                        phEvent);
+  }
+
+  ur_result_t enqueueMemBufferRead(ur_mem_handle_t hBuffer, bool blockingRead,
+                                   size_t offset, size_t size, void *pDst,
+                                   uint32_t numEventsInWaitList,
+                                   const ur_event_handle_t *phEventWaitList,
+                                   ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueMemBufferWrite(ur_mem_handle_t hBuffer, bool blockingWrite,
+                                    size_t offset, size_t size,
+                                    const void *pSrc,
+                                    uint32_t numEventsInWaitList,
+                                    const ur_event_handle_t *phEventWaitList,
+                                    ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueMemBufferReadRect(
+      ur_mem_handle_t hBuffer, bool blockingRead, ur_rect_offset_t bufferOrigin,
+      ur_rect_offset_t hostOrigin, ur_rect_region_t region,
+      size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch,
+      size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList,
+      const ur_event_handle_t *phEventWaitList,
+      ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueMemBufferWriteRect(
+      ur_mem_handle_t hBuffer, bool blockingWrite,
+      ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin,
+      ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch,
+      size_t hostRowPitch, size_t hostSlicePitch, void *pSrc,
+      uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+      ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueMemBufferCopy(ur_mem_handle_t hBufferSrc,
+                                   ur_mem_handle_t hBufferDst, size_t srcOffset,
+                                   size_t dstOffset, size_t size,
+                                   uint32_t numEventsInWaitList,
+                                   const ur_event_handle_t *phEventWaitList,
+                                   ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueMemBufferCopyRect(
+      ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst,
+      ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin,
+      ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch,
+      size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList,
+      const ur_event_handle_t *phEventWaitList,
+      ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueMemBufferFill(ur_mem_handle_t hBuffer,
+                                   const void *pPattern, size_t patternSize,
+                                   size_t offset, size_t size,
+                                   uint32_t numEventsInWaitList,
+                                   const ur_event_handle_t *phEventWaitList,
+                                   ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueMemImageRead(ur_mem_handle_t hImage, bool blockingRead,
+                                  ur_rect_offset_t origin,
+                                  ur_rect_region_t region, size_t rowPitch,
+                                  size_t slicePitch, void *pDst,
+                                  uint32_t numEventsInWaitList,
+                                  const ur_event_handle_t *phEventWaitList,
+                                  ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueMemImageWrite(ur_mem_handle_t hImage, bool blockingWrite,
+                                   ur_rect_offset_t origin,
+                                   ur_rect_region_t region, size_t rowPitch,
+                                   size_t slicePitch, void *pSrc,
+                                   uint32_t numEventsInWaitList,
+                                   const ur_event_handle_t *phEventWaitList,
+                                   ur_event_handle_t *phEvent) override;
+
+  ur_result_t
+  enqueueMemImageCopy(ur_mem_handle_t hImageSrc, ur_mem_handle_t hImageDst,
+                      ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin,
+                      ur_rect_region_t region, uint32_t numEventsInWaitList,
+                      const ur_event_handle_t *phEventWaitList,
+                      ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueMemBufferMap(ur_mem_handle_t hBuffer, bool blockingMap,
+                                  ur_map_flags_t mapFlags, size_t offset,
+                                  size_t size, uint32_t numEventsInWaitList,
+                                  const ur_event_handle_t *phEventWaitList,
+                                  ur_event_handle_t *phEvent,
+                                  void **ppRetMap) override;
+
+  ur_result_t enqueueMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr,
+                              uint32_t numEventsInWaitList,
+                              const ur_event_handle_t *phEventWaitList,
+                              ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueUSMFill(void *pMem, size_t patternSize,
+                             const void *pPattern, size_t size,
+                             uint32_t numEventsInWaitList,
+                             const ur_event_handle_t *phEventWaitList,
+                             ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueUSMMemcpy(bool blocking, void *pDst, const void *pSrc,
+                               size_t size, uint32_t numEventsInWaitList,
+                               const ur_event_handle_t *phEventWaitList,
+                               ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueUSMFill2D(void *pMem, size_t pitch, size_t patternSize,
+                               const void *pPattern, size_t width,
+                               size_t height, uint32_t numEventsInWaitList,
+                               const ur_event_handle_t *phEventWaitList,
+                               ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueUSMMemcpy2D(bool blocking, void *pDst, size_t dstPitch,
+                                 const void *pSrc, size_t srcPitch,
+                                 size_t width, size_t height,
+                                 uint32_t numEventsInWaitList,
+                                 const ur_event_handle_t *phEventWaitList,
+                                 ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueUSMPrefetch(const void *pMem, size_t size,
+                                 ur_usm_migration_flags_t flags,
+                                 uint32_t numEventsInWaitList,
+                                 const ur_event_handle_t *phEventWaitList,
+                                 ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueUSMAdvise(const void *pMem, size_t size,
+                               ur_usm_advice_flags_t advice,
+                               ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueDeviceGlobalVariableWrite(
+      ur_program_handle_t hProgram, const char *name, bool blockingWrite,
+      size_t count, size_t offset, const void *pSrc,
+      uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+      ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueDeviceGlobalVariableRead(
+      ur_program_handle_t hProgram, const char *name, bool blockingRead,
+      size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList,
+      const ur_event_handle_t *phEventWaitList,
+      ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueReadHostPipe(ur_program_handle_t hProgram,
+                                  const char *pipe_symbol, bool blocking,
+                                  void *pDst, size_t size,
+                                  uint32_t numEventsInWaitList,
+                                  const ur_event_handle_t *phEventWaitList,
+                                  ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueWriteHostPipe(ur_program_handle_t hProgram,
+                                   const char *pipe_symbol, bool blocking,
+                                   void *pSrc, size_t size,
+                                   uint32_t numEventsInWaitList,
+                                   const ur_event_handle_t *phEventWaitList,
+                                   ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueUSMDeviceAllocExp(
+      ur_usm_pool_handle_t pPool, const size_t size,
+      const ur_exp_async_usm_alloc_properties_t *pProperties,
+      uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+      void **ppMem, ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueUSMSharedAllocExp(
+      ur_usm_pool_handle_t pPool, const size_t size,
+      const ur_exp_async_usm_alloc_properties_t *pProperties,
+      uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+      void **ppMem, ur_event_handle_t *phEvent) override;
+
+  ur_result_t
+  enqueueUSMHostAllocExp(ur_usm_pool_handle_t pPool, const size_t size,
+                         const ur_exp_async_usm_alloc_properties_t *pProperties,
+                         uint32_t numEventsInWaitList,
+                         const ur_event_handle_t *phEventWaitList, void **ppMem,
+                         ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueUSMFreeExp(ur_usm_pool_handle_t pPool, void *pMem,
+                                uint32_t numEventsInWaitList,
+                                const ur_event_handle_t *phEventWaitList,
+                                ur_event_handle_t *phEvent) override;
+
+  ur_result_t bindlessImagesImageCopyExp(
+      const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc,
+      const ur_image_desc_t *pDstImageDesc,
+      const ur_image_format_t *pSrcImageFormat,
+      const ur_image_format_t *pDstImageFormat,
+      ur_exp_image_copy_region_t *pCopyRegion,
+      ur_exp_image_copy_flags_t imageCopyFlags,
+      ur_exp_image_copy_input_types_t imageCopyInputTypes,
+      uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+      ur_event_handle_t *phEvent) override;
+
+  ur_result_t bindlessImagesWaitExternalSemaphoreExp(
+      ur_exp_external_semaphore_handle_t hSemaphore, bool hasWaitValue,
+      uint64_t waitValue, uint32_t numEventsInWaitList,
+      const ur_event_handle_t *phEventWaitList,
+      ur_event_handle_t *phEvent) override;
+
+  ur_result_t bindlessImagesSignalExternalSemaphoreExp(
+      ur_exp_external_semaphore_handle_t hSemaphore, bool hasSignalValue,
+      uint64_t signalValue, uint32_t numEventsInWaitList,
+      const ur_event_handle_t *phEventWaitList,
+      ur_event_handle_t *phEvent) override;
+
+  ur_result_t
+  enqueueTimestampRecordingExp(bool blocking, uint32_t numEventsInWaitList,
+                               const ur_event_handle_t *phEventWaitList,
+                               ur_event_handle_t *phEvent) override;
+
+  ur_result_t
+  enqueueCommandBufferExp(ur_exp_command_buffer_handle_t hCommandBuffer,
+                          uint32_t numEventsInWaitList,
+                          const ur_event_handle_t *phEventWaitList,
+                          ur_event_handle_t *phEvent) override;
+
+  ur_result_t enqueueNativeCommandExp(
+      ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data,
+      uint32_t numMemsInMemList, const ur_mem_handle_t *phMemList,
+      const ur_exp_enqueue_native_command_properties_t *pProperties,
+      uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
+      ur_event_handle_t *phEvent) override;
+
+  ur::RefCount RefCount; // reported by UR_QUEUE_INFO_REFERENCE_COUNT
+};
+
+} // namespace v2
diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp
index be211cb198438..16138342dcdb3 100644
--- a/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp
+++
b/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp @@ -12,9 +12,13 @@ #include "logger/ur_logger.hpp" #include "queue_api.hpp" +#include "queue_batched.hpp" #include "queue_handle.hpp" #include "queue_immediate_in_order.hpp" +// Cached value of the UR_L0_FORCE_BATCHED environment variable. +static const bool ForceBatched = getenv_tobool("UR_L0_FORCE_BATCHED"); + namespace v2 { using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; @@ -62,11 +66,18 @@ ur_result_t urQueueCreate(ur_context_handle_t hContext, return UR_RESULT_ERROR_INVALID_DEVICE; } + TRACK_SCOPE_LATENCY("queueCreate"); + ur_queue_flags_t flags = 0; if (pProperties) { flags = pProperties->flags; } + // The environment variable adds the batched flag to the requested flags. + if (ForceBatched) { + flags |= UR_QUEUE_FLAG_SUBMISSION_BATCHED; + } + auto zeIndex = v2::getZeIndex(pProperties); if ((flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0) { @@ -75,6 +86,11 @@ ur_result_t urQueueCreate(ur_context_handle_t hContext, hContext, hDevice, v2::getZeOrdinal(hDevice), v2::getZePriority(flags), zeIndex, v2::eventFlagsFromQueueFlags(flags), flags); + } else if (flags & UR_QUEUE_FLAG_SUBMISSION_BATCHED) { + *phQueue = ur_queue_handle_t_::create<v2::ur_queue_batched_t>( + hContext, hDevice, v2::getZeOrdinal(hDevice), v2::getZePriority(flags), + zeIndex, v2::eventFlagsFromQueueFlags(flags), flags); + } else { *phQueue = ur_queue_handle_t_::create( hContext, hDevice, v2::getZeOrdinal(hDevice), v2::getZePriority(flags), diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_extensions.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_extensions.hpp new file mode 100644 index 0000000000000..bd7fd23ab6721 --- /dev/null +++ b/unified-runtime/source/adapters/level_zero/v2/queue_extensions.hpp @@ -0,0 +1,22 @@ +// Copyright (C) 2025 Intel Corporation +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions.
See LICENSE.TXT +// +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#pragma once + +#include "ur_api.h" + +struct ur_queue_extensions { + // Hook intended to be called by an event when it is used in a wait list; + // batch_generation is the generation of the batch the event belongs to. + // Non-batched queues need no action, so the default returns success. When the + // event was created by this queue and belongs to its current batch, the + // batch should be enqueued for execution; otherwise the event would never be + // signalled. + virtual ur_result_t + onEventWaitListUse([[maybe_unused]] int64_t batch_generation) { + return UR_RESULT_SUCCESS; + } +}; diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp index c414f79a46d71..a90bd8a27868a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp @@ -16,13 +16,15 @@ #include #include "../common.hpp" +#include "queue_batched.hpp" #include "queue_immediate_in_order.hpp" #include "queue_immediate_out_of_order.hpp" #include struct ur_queue_handle_t_ : ur::handle_base { using data_variant = std::variant; + v2::ur_queue_immediate_out_of_order_t, + v2::ur_queue_batched_t>; data_variant queue_data; static constexpr uintptr_t queue_offset = diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 1e4beaf363da0..27ef4d6a5ca1b 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -105,16 +105,26 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() { auto lockedCommandListManager = commandListManager.lock(); - ZE2UR_CALL(zeCommandListHostSynchronize, - (lockedCommandListManager->getZeCommandList(), UINT64_MAX)); + { + TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::hostSynchronize"); +
ZE2UR_CALL(zeCommandListHostSynchronize, + (lockedCommandListManager->getZeCommandList(), UINT64_MAX)); + } - hContext->getAsyncPool()->cleanupPoolsForQueue(this); - hContext->forEachUsmPool([this](ur_usm_pool_handle_t hPool) { - hPool->cleanupPoolsForQueue(this); - return true; - }); + { + TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::asyncPools"); + hContext->getAsyncPool()->cleanupPoolsForQueue(this); + hContext->forEachUsmPool([this](ur_usm_pool_handle_t hPool) { + hPool->cleanupPoolsForQueue(this); + return true; + }); + } - UR_CALL(lockedCommandListManager->releaseSubmittedKernels()); + { + TRACK_SCOPE_LATENCY( + "ur_queue_immediate_in_order_t::releaseSubmittedKernels"); + UR_CALL(lockedCommandListManager->releaseSubmittedKernels()); + } return UR_RESULT_SUCCESS; } @@ -142,14 +152,15 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrier( // in this queue are completed when the signal is started. However, we do // need to use barrier if profiling is enabled: see // zeCommandListAppendWaitOnEvents + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + if ((flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0) { return commandListManager.lock()->appendEventsWaitWithBarrier( - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } else { return commandListManager.lock()->appendEventsWait( - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } } diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index 3f230861ad563..ba13ba507f11a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ 
b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -60,10 +60,12 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { const ur_kernel_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendKernelLaunch( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, numEventsInWaitList, - phEventWaitList, + numPropsInLaunchPropList, launchPropList, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t @@ -74,9 +76,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_result_t enqueueEventsWait(uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendEventsWait( - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, @@ -92,9 +96,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemBufferRead( - hBuffer, blockingRead, offset, size, pDst, numEventsInWaitList, - phEventWaitList, + hBuffer, blockingRead, offset, size, pDst, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -104,9 +110,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ 
{ uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemBufferWrite( - hBuffer, blockingWrite, offset, size, pSrc, numEventsInWaitList, - phEventWaitList, + hBuffer, blockingWrite, offset, size, pSrc, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -117,10 +125,12 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemBufferReadRect( hBuffer, blockingRead, bufferOrigin, hostOrigin, region, bufferRowPitch, - bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, - numEventsInWaitList, phEventWaitList, + bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -131,11 +141,13 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemBufferWriteRect( hBuffer, blockingWrite, bufferOrigin, hostOrigin, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemBufferCopy(ur_mem_handle_t hBufferSrc, @@ -144,9 +156,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { 
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemBufferCopy( - hBufferSrc, hBufferDst, srcOffset, dstOffset, size, numEventsInWaitList, - phEventWaitList, + hBufferSrc, hBufferDst, srcOffset, dstOffset, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -157,10 +171,12 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemBufferCopyRect( hBufferSrc, hBufferDst, srcOrigin, dstOrigin, region, srcRowPitch, - srcSlicePitch, dstRowPitch, dstSlicePitch, numEventsInWaitList, - phEventWaitList, + srcSlicePitch, dstRowPitch, dstSlicePitch, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -170,9 +186,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemBufferFill( - hBuffer, pPattern, patternSize, offset, size, numEventsInWaitList, - phEventWaitList, + hBuffer, pPattern, patternSize, offset, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -183,10 +201,12 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return 
commandListManager.lock()->appendMemImageRead( hImage, blockingRead, origin, region, rowPitch, slicePitch, pDst, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemImageWrite(ur_mem_handle_t hImage, bool blockingWrite, @@ -196,10 +216,12 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemImageWrite( hImage, blockingWrite, origin, region, rowPitch, slicePitch, pSrc, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t @@ -208,9 +230,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_rect_region_t region, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemImageCopy( - hImageSrc, hImageDst, srcOrigin, dstOrigin, region, numEventsInWaitList, - phEventWaitList, + hImageSrc, hImageDst, srcOrigin, dstOrigin, region, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -220,18 +244,23 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, void **ppRetMap) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemBufferMap( - hBuffer, blockingMap, mapFlags, offset, size, numEventsInWaitList, - phEventWaitList, 
createEventIfRequested(eventPool.get(), phEvent, this), - ppRetMap); + hBuffer, blockingMap, mapFlags, offset, size, waitListView, + createEventIfRequested(eventPool.get(), phEvent, this), ppRetMap); } ur_result_t enqueueMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendMemUnmap( - hMem, pMappedPtr, numEventsInWaitList, phEventWaitList, + hMem, pMappedPtr, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -240,8 +269,10 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); return commandListManager.lock()->appendUSMFill( - pMem, patternSize, pPattern, size, numEventsInWaitList, phEventWaitList, + pMem, patternSize, pPattern, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -249,8 +280,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendUSMMemcpy( - blocking, pDst, pSrc, size, numEventsInWaitList, phEventWaitList, + blocking, pDst, pSrc, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -259,9 +293,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { size_t height, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, 
numEventsInWaitList); + return commandListManager.lock()->appendUSMFill2D( - pMem, pitch, patternSize, pPattern, width, height, numEventsInWaitList, - phEventWaitList, + pMem, pitch, patternSize, pPattern, width, height, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -271,9 +307,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendUSMMemcpy2D( - blocking, pDst, dstPitch, pSrc, srcPitch, width, height, - numEventsInWaitList, phEventWaitList, + blocking, pDst, dstPitch, pSrc, srcPitch, width, height, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -282,16 +320,21 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendUSMPrefetch( - pMem, size, flags, numEventsInWaitList, phEventWaitList, + pMem, size, flags, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueUSMAdvise(const void *pMem, size_t size, ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) override { + wait_list_view emptyWaitList = wait_list_view(nullptr, 0); + return commandListManager.lock()->appendUSMAdvise( - pMem, size, advice, 0, nullptr, + pMem, size, advice, emptyWaitList, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -300,9 +343,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { size_t count, size_t offset, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view 
waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendDeviceGlobalVariableWrite( - hProgram, name, blockingWrite, count, offset, pSrc, numEventsInWaitList, - phEventWaitList, + hProgram, name, blockingWrite, count, offset, pSrc, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -311,9 +356,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendDeviceGlobalVariableRead( - hProgram, name, blockingRead, count, offset, pDst, numEventsInWaitList, - phEventWaitList, + hProgram, name, blockingRead, count, offset, pDst, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -323,9 +370,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendReadHostPipe( - hProgram, pipe_symbol, blocking, pDst, size, numEventsInWaitList, - phEventWaitList, + hProgram, pipe_symbol, blocking, pDst, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -335,9 +384,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendWriteHostPipe( - hProgram, pipe_symbol, blocking, pSrc, size, numEventsInWaitList, - phEventWaitList, + hProgram, pipe_symbol, blocking, pSrc, size, 
waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -346,9 +397,12 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { const ur_exp_async_usm_alloc_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendUSMAllocHelper( - this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, - ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + this, pPool, size, pProperties, waitListView, ppMem, + createEventIfRequested(eventPool.get(), phEvent, this), UR_USM_TYPE_DEVICE); } @@ -357,9 +411,12 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { const ur_exp_async_usm_alloc_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendUSMAllocHelper( - this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, - ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + this, pPool, size, pProperties, waitListView, ppMem, + createEventIfRequested(eventPool.get(), phEvent, this), UR_USM_TYPE_SHARED); } @@ -369,9 +426,12 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendUSMAllocHelper( - this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, - ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + this, pPool, size, pProperties, waitListView, ppMem, + 
createEventIfRequested(eventPool.get(), phEvent, this), UR_USM_TYPE_HOST); } @@ -379,8 +439,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendUSMFreeExp( - this, pPool, pMem, numEventsInWaitList, phEventWaitList, + this, pPool, pMem, waitListView, createEventAndRetain(eventPool.get(), phEvent, this)); } @@ -394,11 +457,13 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { ur_exp_image_copy_input_types_t imageCopyInputTypes, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->bindlessImagesImageCopyExp( pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, pDstImageFormat, pCopyRegion, imageCopyFlags, imageCopyInputTypes, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t bindlessImagesWaitExternalSemaphoreExp( @@ -406,9 +471,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint64_t waitValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->bindlessImagesWaitExternalSemaphoreExp( - hSemaphore, hasWaitValue, waitValue, numEventsInWaitList, - phEventWaitList, + hSemaphore, hasWaitValue, waitValue, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -417,9 +484,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint64_t signalValue, 
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->bindlessImagesSignalExternalSemaphoreExp( - hSemaphore, hasSignalValue, signalValue, numEventsInWaitList, - phEventWaitList, + hSemaphore, hasSignalValue, signalValue, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -427,8 +496,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { enqueueTimestampRecordingExp(bool blocking, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendTimestampRecordingExp( - blocking, numEventsInWaitList, phEventWaitList, + blocking, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -437,8 +509,11 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendCommandBufferExp( - hCommandBuffer, numEventsInWaitList, phEventWaitList, + hCommandBuffer, waitListView, createEventAndRetain(eventPool.get(), phEvent, this)); } @@ -448,10 +523,12 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { const ur_exp_enqueue_native_command_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + return commandListManager.lock()->appendNativeCommandExp( pfnNativeEnqueue, data, numMemsInMemList, phMemList, pProperties, - numEventsInWaitList, 
phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur::RefCount RefCount; diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp index dae2e42f93069..83f7181f004ed 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp @@ -10,6 +10,7 @@ #include "queue_immediate_out_of_order.hpp" #include "../common/latency_tracker.hpp" +#include "command_list_manager.hpp" #include "ur.hpp" namespace v2 { @@ -153,6 +154,9 @@ ur_result_t ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrier( // commands in this queue are completed when the signal is started. However, // we do need to use barrier if profiling is enabled: see // zeCommandListAppendWaitOnEvents + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + bool needsRealBarrier = (flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0; auto barrierFn = needsRealBarrier ? &ur_command_list_manager::appendEventsWaitWithBarrier @@ -161,27 +165,30 @@ ur_result_t ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrier( auto commandListManagersLocked = commandListManagers.lock(); // Enqueue wait for the user-provider events on the first command list. - UR_CALL(commandListManagersLocked[0].appendEventsWait( - numEventsInWaitList, phEventWaitList, barrierEvents[0])); + UR_CALL(commandListManagersLocked[0].appendEventsWait(waitListView, + barrierEvents[0])); + + wait_list_view emptyWaitlist = wait_list_view(nullptr, 0); // Request barrierEvents[id] to be signaled on remaining command lists. 
for (size_t id = 1; id < numCommandLists; id++) { - UR_CALL(commandListManagersLocked[id].appendEventsWait(0, nullptr, + UR_CALL(commandListManagersLocked[id].appendEventsWait(emptyWaitlist, barrierEvents[id])); } // Enqueue barriers on all command lists by waiting on barrierEvents. + wait_list_view barrierEventsWaitList = + wait_list_view(barrierEvents.data(), numCommandLists); if (phEvent) { - UR_CALL( - std::invoke(barrierFn, commandListManagersLocked[0], numCommandLists, - barrierEvents.data(), - createEventIfRequested(eventPool.get(), phEvent, this))); + UR_CALL(std::invoke( + barrierFn, commandListManagersLocked[0], barrierEventsWaitList, + createEventIfRequested(eventPool.get(), phEvent, this))); } for (size_t id = phEvent ? 1 : 0; id < numCommandLists; id++) { UR_CALL(std::invoke(barrierFn, commandListManagersLocked[0], - numCommandLists, barrierEvents.data(), nullptr)); + barrierEventsWaitList, nullptr)); } return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp index f1ad68a62a1a8..d6ad83a78512f 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp @@ -1,4 +1,4 @@ -//===--------- queue_immediate_in_order.hpp - Level Zero Adapter ---------===// +//===------- queue_immediate_out_of_order.hpp - Level Zero Adapter --------===// // // Copyright (C) 2025 Intel Corporation // @@ -73,11 +73,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { const ur_kernel_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return 
commandListManagers.lock()[commandListId].appendKernelLaunch( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numPropsInLaunchPropList, launchPropList, numEventsInWaitList, - phEventWaitList, + numPropsInLaunchPropList, launchPropList, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t @@ -88,10 +90,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { ur_result_t enqueueEventsWait(uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendEventsWait( - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, @@ -107,10 +111,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemBufferRead( - hBuffer, blockingRead, offset, size, pDst, numEventsInWaitList, - phEventWaitList, + hBuffer, blockingRead, offset, size, pDst, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -120,10 +126,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return 
commandListManagers.lock()[commandListId].appendMemBufferWrite( - hBuffer, blockingWrite, offset, size, pSrc, numEventsInWaitList, - phEventWaitList, + hBuffer, blockingWrite, offset, size, pSrc, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -134,11 +142,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemBufferReadRect( hBuffer, blockingRead, bufferOrigin, hostOrigin, region, bufferRowPitch, - bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, - numEventsInWaitList, phEventWaitList, + bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -149,12 +159,14 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemBufferWriteRect( hBuffer, blockingWrite, bufferOrigin, hostOrigin, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemBufferCopy(ur_mem_handle_t hBufferSrc, @@ -163,10 +175,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t 
*phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemBufferCopy( - hBufferSrc, hBufferDst, srcOffset, dstOffset, size, numEventsInWaitList, - phEventWaitList, + hBufferSrc, hBufferDst, srcOffset, dstOffset, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -177,11 +191,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemBufferCopyRect( hBufferSrc, hBufferDst, srcOrigin, dstOrigin, region, srcRowPitch, - srcSlicePitch, dstRowPitch, dstSlicePitch, numEventsInWaitList, - phEventWaitList, + srcSlicePitch, dstRowPitch, dstSlicePitch, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -191,10 +207,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemBufferFill( - hBuffer, pPattern, patternSize, offset, size, numEventsInWaitList, - phEventWaitList, + hBuffer, pPattern, patternSize, offset, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -205,11 +223,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t 
*phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemImageRead( hImage, blockingRead, origin, region, rowPitch, slicePitch, pDst, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueMemImageWrite(ur_mem_handle_t hImage, bool blockingWrite, @@ -219,11 +239,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemImageWrite( hImage, blockingWrite, origin, region, rowPitch, slicePitch, pSrc, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t @@ -232,10 +254,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { ur_rect_region_t region, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemImageCopy( - hImageSrc, hImageDst, srcOrigin, dstOrigin, region, numEventsInWaitList, - phEventWaitList, + hImageSrc, hImageDst, srcOrigin, dstOrigin, region, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -245,20 +269,25 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { const ur_event_handle_t *phEventWaitList, 
ur_event_handle_t *phEvent, void **ppRetMap) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemBufferMap( - hBuffer, blockingMap, mapFlags, offset, size, numEventsInWaitList, - phEventWaitList, createEventIfRequested(eventPool.get(), phEvent, this), - ppRetMap); + hBuffer, blockingMap, mapFlags, offset, size, waitListView, + createEventIfRequested(eventPool.get(), phEvent, this), ppRetMap); } ur_result_t enqueueMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendMemUnmap( - hMem, pMappedPtr, numEventsInWaitList, phEventWaitList, + hMem, pMappedPtr, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -267,9 +296,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMFill( - pMem, patternSize, pPattern, size, numEventsInWaitList, phEventWaitList, + pMem, patternSize, pPattern, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -277,9 +309,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = 
getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMMemcpy( - blocking, pDst, pSrc, size, numEventsInWaitList, phEventWaitList, + blocking, pDst, pSrc, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -288,10 +323,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { size_t height, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMFill2D( - pMem, pitch, patternSize, pPattern, width, height, numEventsInWaitList, - phEventWaitList, + pMem, pitch, patternSize, pPattern, width, height, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -301,10 +338,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMMemcpy2D( - blocking, pDst, dstPitch, pSrc, srcPitch, width, height, - numEventsInWaitList, phEventWaitList, + blocking, pDst, dstPitch, pSrc, srcPitch, width, height, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -313,18 +352,23 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMPrefetch( - pMem, size, flags, numEventsInWaitList, phEventWaitList, + 
pMem, size, flags, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t enqueueUSMAdvise(const void *pMem, size_t size, ur_usm_advice_flags_t advice, ur_event_handle_t *phEvent) override { + wait_list_view emptyWaitList = wait_list_view(nullptr, 0); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMAdvise( - pMem, size, advice, 0, nullptr, + pMem, size, advice, emptyWaitList, /* 0, nullptr, */ createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -333,11 +377,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { size_t count, size_t offset, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId] .appendDeviceGlobalVariableWrite( - hProgram, name, blockingWrite, count, offset, pSrc, - numEventsInWaitList, phEventWaitList, + hProgram, name, blockingWrite, count, offset, pSrc, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -346,11 +392,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId] .appendDeviceGlobalVariableRead( - hProgram, name, blockingRead, count, offset, pDst, - numEventsInWaitList, phEventWaitList, + hProgram, name, blockingRead, count, offset, pDst, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -360,10 +408,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { 
uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendReadHostPipe( - hProgram, pipe_symbol, blocking, pDst, size, numEventsInWaitList, - phEventWaitList, + hProgram, pipe_symbol, blocking, pDst, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -373,10 +423,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendWriteHostPipe( - hProgram, pipe_symbol, blocking, pSrc, size, numEventsInWaitList, - phEventWaitList, + hProgram, pipe_symbol, blocking, pSrc, size, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -385,10 +437,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { const ur_exp_async_usm_alloc_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMAllocHelper( - this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, - ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + this, pPool, size, pProperties, waitListView, ppMem, + createEventIfRequested(eventPool.get(), phEvent, this), UR_USM_TYPE_DEVICE); } @@ -397,10 +452,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { const 
ur_exp_async_usm_alloc_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMAllocHelper( - this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, - ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + this, pPool, size, pProperties, waitListView, ppMem, + createEventIfRequested(eventPool.get(), phEvent, this), UR_USM_TYPE_SHARED); } @@ -410,10 +468,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMAllocHelper( - this, pPool, size, pProperties, numEventsInWaitList, phEventWaitList, - ppMem, createEventIfRequested(eventPool.get(), phEvent, this), + this, pPool, size, pProperties, waitListView, ppMem, + createEventIfRequested(eventPool.get(), phEvent, this), UR_USM_TYPE_HOST); } @@ -421,9 +482,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendUSMFreeExp( - this, pPool, pMem, numEventsInWaitList, phEventWaitList, + this, pPool, pMem, waitListView, createEventAndRetain(eventPool.get(), phEvent, this)); } @@ -437,12 +501,14 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { 
ur_exp_image_copy_input_types_t imageCopyInputTypes, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].bindlessImagesImageCopyExp( pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, pDstImageFormat, pCopyRegion, imageCopyFlags, imageCopyInputTypes, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur_result_t bindlessImagesWaitExternalSemaphoreExp( @@ -450,11 +516,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint64_t waitValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId] .bindlessImagesWaitExternalSemaphoreExp( - hSemaphore, hasWaitValue, waitValue, numEventsInWaitList, - phEventWaitList, + hSemaphore, hasWaitValue, waitValue, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -463,11 +531,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint64_t signalValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId] .bindlessImagesSignalExternalSemaphoreExp( - hSemaphore, hasSignalValue, signalValue, numEventsInWaitList, - phEventWaitList, + hSemaphore, hasSignalValue, signalValue, waitListView, createEventIfRequested(eventPool.get(), 
phEvent, this)); } @@ -475,10 +545,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { enqueueTimestampRecordingExp(bool blocking, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId] .appendTimestampRecordingExp( - blocking, numEventsInWaitList, phEventWaitList, + blocking, waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } @@ -487,9 +560,12 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendCommandBufferExp( - hCommandBuffer, numEventsInWaitList, phEventWaitList, + hCommandBuffer, waitListView, createEventAndRetain(eventPool.get(), phEvent, this)); } @@ -499,11 +575,13 @@ struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { const ur_exp_enqueue_native_command_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override { + wait_list_view waitListView = + wait_list_view(phEventWaitList, numEventsInWaitList); + auto commandListId = getNextCommandListId(); return commandListManagers.lock()[commandListId].appendNativeCommandExp( pfnNativeEnqueue, data, numMemsInMemList, phMemList, pProperties, - numEventsInWaitList, phEventWaitList, - createEventIfRequested(eventPool.get(), phEvent, this)); + waitListView, createEventIfRequested(eventPool.get(), phEvent, this)); } ur::RefCount RefCount; diff --git a/unified-runtime/source/adapters/native_cpu/queue.cpp 
b/unified-runtime/source/adapters/native_cpu/queue.cpp index 5de7037519490..8ace8306088a2 100644 --- a/unified-runtime/source/adapters/native_cpu/queue.cpp +++ b/unified-runtime/source/adapters/native_cpu/queue.cpp @@ -21,6 +21,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, size_t *pPropSizeRet) { UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + ur_queue_flags_t flags = 0; switch (propName) { case UR_QUEUE_INFO_CONTEXT: @@ -31,6 +32,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueGetInfo(ur_queue_handle_t hQueue, return ReturnValue(hQueue->getReferenceCount()); case UR_QUEUE_INFO_EMPTY: return ReturnValue(hQueue->isEmpty()); + case UR_QUEUE_INFO_FLAGS: + // Support for UR_QUEUE_INFO_FLAGS in urQueueGetInfo is required by the + // enqueueTimestampRecording tests after introducing batched queues, since + // batched queues do not support enqueueTimestampRecording. + if (!hQueue->isInOrder()) { + flags |= UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE; + } + if (hQueue->isProfiling()) { + flags |= UR_QUEUE_FLAG_PROFILING_ENABLE; + } + + return ReturnValue(flags); default: return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; } diff --git a/unified-runtime/test/adapters/level_zero/v2/command_list_cache_test.cpp b/unified-runtime/test/adapters/level_zero/v2/command_list_cache_test.cpp index 488165e0761a8..8ff58c991bd01 100644 --- a/unified-runtime/test/adapters/level_zero/v2/command_list_cache_test.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/command_list_cache_test.cpp @@ -15,6 +15,7 @@ #include "uur/fixtures.h" #include "uur/raii.h" +#include "uur/utils.h" #include #include @@ -186,6 +187,7 @@ TEST_P(CommandListCacheTest, ImmediateCommandListsHaveProperAttributes) { TEST_P(CommandListCacheTest, CommandListsAreReusedByQueues) { static constexpr int NumQueuesPerType = 5; size_t NumUniqueQueueTypes = 0; + bool isBatched = false; for (int I = 0; I < NumQueuesPerType; I++) { NumUniqueQueueTypes = 0; @@ -216,6 +218,8 @@ 
TEST_P(CommandListCacheTest, CommandListsAreReusedByQueues) { ASSERT_EQ(urQueueCreate(context, device, &QueueProps, Queue.ptr()), UR_RESULT_SUCCESS); + ASSERT_NO_FATAL_FAILURE(uur::isQueueBatched(Queue, &isBatched)); + Queues.emplace_back(Queue); } } @@ -227,7 +231,13 @@ TEST_P(CommandListCacheTest, CommandListsAreReusedByQueues) { ASSERT_EQ(context->getCommandListCache().getNumImmediateCommandLists(), NumUniqueQueueTypes); - ASSERT_EQ(context->getCommandListCache().getNumRegularCommandLists(), 0); + + if (isBatched) { + ASSERT_EQ(context->getCommandListCache().getNumRegularCommandLists(), + NumUniqueQueueTypes); + } else { + ASSERT_EQ(context->getCommandListCache().getNumRegularCommandLists(), 0); + } } } diff --git a/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp b/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp index 2de31b830895a..a90959dc2eef1 100644 --- a/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp @@ -24,6 +24,7 @@ #include "event_provider_counter.hpp" #include "event_provider_normal.hpp" #include "queue_handle.hpp" +#include "uur/checks.h" #include "uur/fixtures.h" #include "ze_api.h" @@ -277,6 +278,7 @@ TEST_P(EventPoolTestWithQueue, WithTimestamp) { GTEST_SKIP() << "Profiling needs to be enabled"; } + SKIP_IF_BATCHED_QUEUE(queue); auto zeEvent = createZeEvent(context, device); ur_event_handle_t hEvent; diff --git a/unified-runtime/test/conformance/enqueue/urEnqueueTimestampRecording.cpp b/unified-runtime/test/conformance/enqueue/urEnqueueTimestampRecording.cpp index 5f65044f66ad9..e0936cc795cc1 100644 --- a/unified-runtime/test/conformance/enqueue/urEnqueueTimestampRecording.cpp +++ b/unified-runtime/test/conformance/enqueue/urEnqueueTimestampRecording.cpp @@ -4,12 +4,16 @@ // // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +#include "uur/checks.h" +#include #include #include struct urEnqueueTimestampRecordingExpTest : 
uur::urQueueTest { void SetUp() override { UUR_RETURN_ON_FATAL_FAILURE(urQueueTest::SetUp()); + + SKIP_IF_BATCHED_QUEUE(queue); bool timestamp_recording_support = false; ASSERT_SUCCESS( uur::GetTimestampRecordingSupport(device, timestamp_recording_support)); diff --git a/unified-runtime/test/conformance/queue/urQueueGetInfo.cpp b/unified-runtime/test/conformance/queue/urQueueGetInfo.cpp index 05b1a0f76ff0c..eeb4bb2530bdb 100644 --- a/unified-runtime/test/conformance/queue/urQueueGetInfo.cpp +++ b/unified-runtime/test/conformance/queue/urQueueGetInfo.cpp @@ -125,8 +125,6 @@ TEST_P(urQueueGetInfoTest, SuccessRoundtripNullDevice) { } TEST_P(urQueueGetInfoTest, SuccessFlags) { - UUR_KNOWN_FAILURE_ON(uur::NativeCPU{}); - size_t property_size = 0; const ur_queue_info_t property_name = UR_QUEUE_INFO_FLAGS; diff --git a/unified-runtime/test/conformance/testing/include/uur/utils.h b/unified-runtime/test/conformance/testing/include/uur/utils.h index 179d8d583efe8..199ce2d3ed05d 100644 --- a/unified-runtime/test/conformance/testing/include/uur/utils.h +++ b/unified-runtime/test/conformance/testing/include/uur/utils.h @@ -489,6 +489,30 @@ getDriverVersion(ur_device_handle_t hDevice) { } \ } while (0) +#define SKIP_IF_BATCHED_QUEUE(queue) \ + do { \ + ur_queue_flags_t queueFlags{}; \ + ASSERT_EQ(urQueueGetInfo(queue, UR_QUEUE_INFO_FLAGS, \ + sizeof(ur_queue_flags_t), &queueFlags, nullptr), \ + UR_RESULT_SUCCESS); \ + \ + if (queueFlags & UR_QUEUE_FLAG_SUBMISSION_BATCHED) { \ + UUR_KNOWN_FAILURE_ON(uur::LevelZeroV2{}); \ + } \ + } while (0) + +inline void isQueueBatched(ur_queue_handle_t queue, bool *info) { + ur_queue_flags_t queueFlags{}; + ASSERT_EQ(urQueueGetInfo(queue, UR_QUEUE_INFO_FLAGS, sizeof(ur_queue_flags_t), + &queueFlags, nullptr), + UR_RESULT_SUCCESS); + if (queueFlags & UR_QUEUE_FLAG_SUBMISSION_BATCHED) { + *info = true; + } else { + *info = false; + } +} + // Is this a Data Center GPU Max series (aka PVC)? 
// TODO: change to use // https://spec.oneapi.io/level-zero/latest/core/api.html#ze-device-ip-version-ext-t