diff --git a/devops/scripts/benchmarks/benches/compute.py b/devops/scripts/benchmarks/benches/compute.py index 3181c0e51eaf1..d1b6c7574094b 100644 --- a/devops/scripts/benchmarks/benches/compute.py +++ b/devops/scripts/benchmarks/benches/compute.py @@ -367,7 +367,7 @@ def bin_args(self) -> list[str]: "--iterations=100000", "--Profiling=0", "--NumKernels=10", - "--KernelExecTime=1", + "--KernelExecTime=20", f"--UseEvents={self.UseEvents}", ] diff --git a/unified-runtime/source/adapters/level_zero/CMakeLists.txt b/unified-runtime/source/adapters/level_zero/CMakeLists.txt index 1f8be4c625ee9..096e25a032657 100644 --- a/unified-runtime/source/adapters/level_zero/CMakeLists.txt +++ b/unified-runtime/source/adapters/level_zero/CMakeLists.txt @@ -186,6 +186,7 @@ if(UR_BUILD_ADAPTER_L0_V2) ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_api.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_create.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_in_order.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/v2/queue_immediate_out_of_order.cpp ${CMAKE_CURRENT_SOURCE_DIR}/v2/usm.cpp ) install_ur_library(ur_adapter_level_zero_v2) diff --git a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp index a6541cff99adf..af6b4d09fc8b8 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp @@ -68,8 +68,7 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( const ur_exp_command_buffer_desc_t *desc) : commandListManager( context, device, - std::forward(commandList), - v2::EVENT_FLAGS_COUNTER, nullptr), + std::forward(commandList)), isUpdatable(desc ? desc->isUpdatable : false), context(context), device(device) {} @@ -246,7 +245,7 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( commandListLocked, hKernel, workDim, pGlobalWorkSize, numKernelAlternatives, kernelAlternatives, command)); } - UR_CALL(commandListLocked->appendKernelLaunch( + UR_CALL(commandListLocked->enqueueKernelLaunch( hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, 0, nullptr, nullptr)); return UR_RESULT_SUCCESS; @@ -266,8 +265,8 @@ ur_result_t urCommandBufferAppendUSMMemcpyExp( // Responsibility of UMD to offload to copy engine auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMMemcpy(false, pDst, pSrc, size, 0, - nullptr, nullptr)); + UR_CALL(commandListLocked->enqueueUSMMemcpy(false, pDst, pSrc, size, 0, + nullptr, nullptr)); return UR_RESULT_SUCCESS; } catch (...) { @@ -289,7 +288,7 @@ ur_result_t urCommandBufferAppendMemBufferCopyExp( // sync mechanic can be ignored, because all lists are in-order // Responsibility of UMD to offload to copy engine auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferCopy( + UR_CALL(commandListLocked->enqueueMemBufferCopy( hSrcMem, hDstMem, srcOffset, dstOffset, size, 0, nullptr, nullptr)); return UR_RESULT_SUCCESS; @@ -312,8 +311,8 @@ ur_result_t urCommandBufferAppendMemBufferWriteExp( // sync mechanic can be ignored, because all lists are in-order // Responsibility of UMD to offload to copy engine auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferWrite(hBuffer, false, offset, size, - pSrc, 0, nullptr, nullptr)); + UR_CALL(commandListLocked->enqueueMemBufferWrite(hBuffer, false, offset, size, + pSrc, 0, nullptr, nullptr)); return UR_RESULT_SUCCESS; } catch (...) { @@ -334,8 +333,8 @@ ur_result_t urCommandBufferAppendMemBufferReadExp( // the same issue as in urCommandBufferAppendKernelLaunchExp // Responsibility of UMD to offload to copy engine auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferRead(hBuffer, false, offset, size, - pDst, 0, nullptr, nullptr)); + UR_CALL(commandListLocked->enqueueMemBufferRead(hBuffer, false, offset, size, + pDst, 0, nullptr, nullptr)); return UR_RESULT_SUCCESS; } catch (...) { @@ -359,7 +358,7 @@ ur_result_t urCommandBufferAppendMemBufferCopyRectExp( // sync mechanic can be ignored, because all lists are in-order // Responsibility of UMD to offload to copy engine auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferCopyRect( + UR_CALL(commandListLocked->enqueueMemBufferCopyRect( hSrcMem, hDstMem, srcOrigin, dstOrigin, region, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, 0, nullptr, nullptr)); @@ -385,7 +384,7 @@ ur_result_t urCommandBufferAppendMemBufferWriteRectExp( // Responsibility of UMD to offload to copy engine auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferWriteRect( + UR_CALL(commandListLocked->enqueueMemBufferWriteRect( hBuffer, false, bufferOffset, hostOffset, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, 0, nullptr, nullptr)); @@ -412,7 +411,7 @@ ur_result_t urCommandBufferAppendMemBufferReadRectExp( // Responsibility of UMD to offload to copy engine auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferReadRect( + UR_CALL(commandListLocked->enqueueMemBufferReadRect( hBuffer, false, bufferOffset, hostOffset, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, 0, nullptr, nullptr)); @@ -434,8 +433,8 @@ ur_result_t urCommandBufferAppendUSMFillExp( ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMFill(pMemory, patternSize, pPattern, size, - 0, nullptr, nullptr)); + UR_CALL(commandListLocked->enqueueUSMFill(pMemory, patternSize, pPattern, + size, 0, nullptr, nullptr)); return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -454,7 +453,7 @@ ur_result_t urCommandBufferAppendMemBufferFillExp( // the same issue as in urCommandBufferAppendKernelLaunchExp auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferFill( + UR_CALL(commandListLocked->enqueueMemBufferFill( hBuffer, pPattern, patternSize, offset, size, 0, nullptr, nullptr)); return UR_RESULT_SUCCESS; } catch (...) { @@ -475,8 +474,8 @@ ur_result_t urCommandBufferAppendUSMPrefetchExp( // the same issue as in urCommandBufferAppendKernelLaunchExp auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMPrefetch(pMemory, size, flags, 0, nullptr, - nullptr)); + UR_CALL(commandListLocked->enqueueUSMPrefetch(pMemory, size, flags, 0, + nullptr, nullptr)); return UR_RESULT_SUCCESS; } catch (...) { @@ -496,7 +495,7 @@ ur_result_t urCommandBufferAppendUSMAdviseExp( // the same issue as in urCommandBufferAppendKernelLaunchExp auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMAdvise(pMemory, size, advice, nullptr)); + UR_CALL(commandListLocked->enqueueUSMAdvise(pMemory, size, advice, nullptr)); return UR_RESULT_SUCCESS; } catch (...) { @@ -546,13 +545,13 @@ ur_result_t urCommandBufferAppendNativeCommandExp( // Barrier on all commands before user defined commands. auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendBarrier(0, nullptr, nullptr)); + UR_CALL(commandListLocked->enqueueEventsWaitWithBarrier(0, nullptr, nullptr)); // Call user-defined function immediately pfnNativeCommand(pData); // Barrier on all commands after user defined commands. - UR_CALL(commandListLocked->appendBarrier(0, nullptr, nullptr)); + UR_CALL(commandListLocked->enqueueEventsWaitWithBarrier(0, nullptr, nullptr)); return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp index d6f865d80b5c3..8ce3b4048babc 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp @@ -12,37 +12,34 @@ #include "../helpers/kernel_helpers.hpp" #include "../helpers/memory_helpers.hpp" #include "../ur_interface_loader.hpp" +#include "command_buffer.hpp" #include "context.hpp" #include "kernel.hpp" ur_command_list_manager::ur_command_list_manager( ur_context_handle_t context, ur_device_handle_t device, - v2::raii::command_list_unique_handle &&commandList, v2::event_flags_t flags, - ur_queue_t_ *queue) - : context(context), device(device), - eventPool(context->getEventPoolCache().borrow(device->Id.value(), flags)), - zeCommandList(std::move(commandList)), queue(queue) { + v2::raii::command_list_unique_handle &&commandList) + : hContext(context), hDevice(device), + zeCommandList(std::move(commandList)) { UR_CALL_THROWS(ur::level_zero::urContextRetain(context)); UR_CALL_THROWS(ur::level_zero::urDeviceRetain(device)); } ur_command_list_manager::~ur_command_list_manager() { - ur::level_zero::urContextRelease(context); - ur::level_zero::urDeviceRelease(device); + ur::level_zero::urContextRelease(hContext); + ur::level_zero::urDeviceRelease(hDevice); } ur_result_t ur_command_list_manager::appendGenericFillUnlocked( ur_mem_buffer_t *dst, size_t offset, size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, - ur_command_t commandType) { - - auto zeSignalEvent = getSignalEvent(phEvent, commandType); + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + auto zeSignalEvent = getSignalEvent(phEvent); auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto pDst = ur_cast(dst->getDevicePtr( - device, ur_mem_buffer_t::device_access_mode_t::read_only, offset, size, + hDevice, ur_mem_buffer_t::device_access_mode_t::read_only, offset, size, [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, (zeCommandList.get(), dst, src, size, nullptr, @@ -78,15 +75,13 @@ ur_result_t ur_command_list_manager::appendGenericFillUnlocked( ur_result_t ur_command_list_manager::appendGenericCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, size_t srcOffset, size_t dstOffset, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, - ur_command_t commandType) { - auto zeSignalEvent = getSignalEvent(phEvent, commandType); - + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + auto zeSignalEvent = getSignalEvent(phEvent); auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto pSrc = ur_cast(src->getDevicePtr( - device, ur_mem_buffer_t::device_access_mode_t::read_only, srcOffset, size, - [&](void *src, void *dst, size_t size) { + hDevice, ur_mem_buffer_t::device_access_mode_t::read_only, srcOffset, + size, [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, (zeCommandList.get(), dst, src, size, nullptr, waitListView.num, waitListView.handles)); @@ -94,7 +89,7 @@ ur_result_t ur_command_list_manager::appendGenericCopyUnlocked( })); auto pDst = ur_cast(dst->getDevicePtr( - device, ur_mem_buffer_t::device_access_mode_t::write_only, dstOffset, + hDevice, ur_mem_buffer_t::device_access_mode_t::write_only, dstOffset, size, [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, (zeCommandList.get(), dst, src, size, nullptr, @@ -118,17 +113,15 @@ ur_result_t ur_command_list_manager::appendRegionCopyUnlocked( ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, - ur_command_t commandType) { + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { auto zeParams = ur2zeRegionParams(srcOrigin, dstOrigin, region, srcRowPitch, dstRowPitch, srcSlicePitch, dstSlicePitch); - auto zeSignalEvent = getSignalEvent(phEvent, commandType); - + auto zeSignalEvent = getSignalEvent(phEvent); auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto pSrc = ur_cast(src->getDevicePtr( - device, ur_mem_buffer_t::device_access_mode_t::read_only, 0, + hDevice, ur_mem_buffer_t::device_access_mode_t::read_only, 0, src->getSize(), [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, (zeCommandList.get(), dst, src, size, nullptr, @@ -136,7 +129,7 @@ ur_result_t ur_command_list_manager::appendRegionCopyUnlocked( waitListView.clear(); })); auto pDst = ur_cast(dst->getDevicePtr( - device, ur_mem_buffer_t::device_access_mode_t::write_only, 0, + hDevice, ur_mem_buffer_t::device_access_mode_t::write_only, 0, dst->getSize(), [&](void *src, void *dst, size_t size) { ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, (zeCommandList.get(), dst, src, size, nullptr, @@ -174,23 +167,20 @@ wait_list_view ur_command_list_manager::getWaitListView( } ze_event_handle_t -ur_command_list_manager::getSignalEvent(ur_event_handle_t *hUserEvent, - ur_command_t commandType) { +ur_command_list_manager::getSignalEvent(ur_event_handle_t *hUserEvent) { if (hUserEvent) { - *hUserEvent = eventPool->allocate(); - (*hUserEvent)->resetQueueAndCommand(queue, commandType); return (*hUserEvent)->getZeEvent(); } else { return nullptr; } } -ur_result_t ur_command_list_manager::appendKernelLaunch( +ur_result_t ur_command_list_manager::enqueueKernelLaunch( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_command_list_manager::appendKernelLaunch"); + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueKernelLaunch"); UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); @@ -198,18 +188,17 @@ ur_result_t ur_command_list_manager::appendKernelLaunch( UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(device); + ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(hDevice); std::scoped_lock Lock(hKernel->Mutex); ze_group_count_t zeThreadGroupDimensions{1, 1, 1}; uint32_t WG[3]{}; - UR_CALL(calculateKernelWorkDimensions(hZeKernel, device, + UR_CALL(calculateKernelWorkDimensions(hZeKernel, hDevice, zeThreadGroupDimensions, WG, workDim, pGlobalWorkSize, pLocalWorkSize)); - auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_KERNEL_LAUNCH); - + auto zeSignalEvent = getSignalEvent(phEvent); auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); auto memoryMigrate = [&](void *src, void *dst, size_t size) { @@ -229,7 +218,7 @@ ur_result_t ur_command_list_manager::appendKernelLaunch( pGlobalWorkOffset = NULL; } - UR_CALL(hKernel->prepareForSubmission(context, device, pGlobalWorkOffset, + UR_CALL(hKernel->prepareForSubmission(hContext, hDevice, pGlobalWorkOffset, workDim, WG[0], WG[1], WG[2], memoryMigrate)); @@ -244,14 +233,13 @@ ur_result_t ur_command_list_manager::appendKernelLaunch( return UR_RESULT_SUCCESS; } -ur_result_t ur_command_list_manager::appendUSMMemcpy( +ur_result_t ur_command_list_manager::enqueueUSMMemcpy( bool blocking, void *pDst, const void *pSrc, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMMemcpy"); - - auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_MEMCPY); + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueUSMMemcpy"); + auto zeSignalEvent = getSignalEvent(phEvent); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -266,11 +254,11 @@ ur_result_t ur_command_list_manager::appendUSMMemcpy( return UR_RESULT_SUCCESS; } -ur_result_t ur_command_list_manager::appendMemBufferFill( +ur_result_t ur_command_list_manager::enqueueMemBufferFill( ur_mem_handle_t hMem, const void *pPattern, size_t patternSize, size_t offset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferFill"); + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueMemBufferFill"); auto hBuffer = hMem->getBuffer(); UR_ASSERT(offset + size <= hBuffer->getSize(), UR_RESULT_ERROR_INVALID_SIZE); @@ -279,29 +267,28 @@ ur_result_t ur_command_list_manager::appendMemBufferFill( return appendGenericFillUnlocked(hBuffer, offset, patternSize, pPattern, size, numEventsInWaitList, phEventWaitList, - phEvent, UR_COMMAND_MEM_BUFFER_FILL); + phEvent); } -ur_result_t ur_command_list_manager::appendUSMFill( +ur_result_t ur_command_list_manager::enqueueUSMFill( void *pMem, size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMFill"); + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueUSMFill"); - ur_usm_handle_t dstHandle(context, size, pMem); + ur_usm_handle_t dstHandle(hContext, size, pMem); return appendGenericFillUnlocked(&dstHandle, 0, patternSize, pPattern, size, numEventsInWaitList, phEventWaitList, - phEvent, UR_COMMAND_USM_FILL); + phEvent); } -ur_result_t ur_command_list_manager::appendUSMPrefetch( +ur_result_t ur_command_list_manager::enqueueUSMPrefetch( const void *pMem, size_t size, ur_usm_migration_flags_t /*flags*/, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMPrefetch"); - - auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_PREFETCH); + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueUSMPrefetch"); + auto zeSignalEvent = getSignalEvent(phEvent); auto [pWaitEvents, numWaitEvents] = getWaitListView(phEventWaitList, numEventsInWaitList); @@ -321,15 +308,14 @@ ur_result_t ur_command_list_manager::appendUSMPrefetch( } ur_result_t -ur_command_list_manager::appendUSMAdvise(const void *pMem, size_t size, - ur_usm_advice_flags_t advice, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMAdvise"); +ur_command_list_manager::enqueueUSMAdvise(const void *pMem, size_t size, + ur_usm_advice_flags_t advice, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueUSMAdvise"); auto zeAdvice = ur_cast(advice); - auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); - + auto zeSignalEvent = getSignalEvent(phEvent); auto [pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0); if (pWaitEvents) { @@ -338,7 +324,7 @@ ur_command_list_manager::appendUSMAdvise(const void *pMem, size_t size, } ZE2UR_CALL(zeCommandListAppendMemAdvise, - (zeCommandList.get(), device->ZeDevice, pMem, size, zeAdvice)); + (zeCommandList.get(), hDevice->ZeDevice, pMem, size, zeAdvice)); if (zeSignalEvent) { ZE2UR_CALL(zeCommandListAppendSignalEvent, @@ -347,64 +333,47 @@ ur_command_list_manager::appendUSMAdvise(const void *pMem, size_t size, return UR_RESULT_SUCCESS; } -ur_result_t -ur_command_list_manager::appendBarrier(uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_command_list_manager::appendBarrier"); - - auto zeSignalEvent = - getSignalEvent(phEvent, UR_COMMAND_EVENTS_WAIT_WITH_BARRIER); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(phEventWaitList, numEventsInWaitList); - - ZE2UR_CALL(zeCommandListAppendBarrier, - (zeCommandList.get(), zeSignalEvent, numWaitEvents, pWaitEvents)); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_command_list_manager::appendMemBufferRead( +ur_result_t ur_command_list_manager::enqueueMemBufferRead( ur_mem_handle_t hMem, bool blockingRead, size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferRead"); + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueMemBufferRead"); auto hBuffer = hMem->getBuffer(); UR_ASSERT(offset + size <= hBuffer->getSize(), UR_RESULT_ERROR_INVALID_SIZE); - ur_usm_handle_t dstHandle(context, size, pDst); + ur_usm_handle_t dstHandle(hContext, size, pDst); std::scoped_lock lock(hBuffer->getMutex()); return appendGenericCopyUnlocked(hBuffer, &dstHandle, blockingRead, offset, 0, size, numEventsInWaitList, phEventWaitList, - phEvent, UR_COMMAND_MEM_BUFFER_READ); + phEvent); } -ur_result_t ur_command_list_manager::appendMemBufferWrite( +ur_result_t ur_command_list_manager::enqueueMemBufferWrite( ur_mem_handle_t hMem, bool blockingWrite, size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferWrite"); + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueMemBufferWrite"); auto hBuffer = hMem->getBuffer(); UR_ASSERT(offset + size <= hBuffer->getSize(), UR_RESULT_ERROR_INVALID_SIZE); - ur_usm_handle_t srcHandle(context, size, pSrc); + ur_usm_handle_t srcHandle(hContext, size, pSrc); std::scoped_lock lock(hBuffer->getMutex()); - return appendGenericCopyUnlocked( - &srcHandle, hBuffer, blockingWrite, 0, offset, size, numEventsInWaitList, - phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_WRITE); + return appendGenericCopyUnlocked(&srcHandle, hBuffer, blockingWrite, 0, + offset, size, numEventsInWaitList, + phEventWaitList, phEvent); } -ur_result_t ur_command_list_manager::appendMemBufferCopy( +ur_result_t ur_command_list_manager::enqueueMemBufferCopy( ur_mem_handle_t hSrc, ur_mem_handle_t hDst, size_t srcOffset, size_t dstOffset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferCopy"); + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueMemBufferCopy"); auto hBufferSrc = hSrc->getBuffer(); auto hBufferDst = hDst->getBuffer(); @@ -419,57 +388,54 @@ ur_result_t ur_command_list_manager::appendMemBufferCopy( return appendGenericCopyUnlocked(hBufferSrc, hBufferDst, false, srcOffset, dstOffset, size, numEventsInWaitList, - phEventWaitList, phEvent, - UR_COMMAND_MEM_BUFFER_COPY); + phEventWaitList, phEvent); } -ur_result_t ur_command_list_manager::appendMemBufferReadRect( +ur_result_t ur_command_list_manager::enqueueMemBufferReadRect( ur_mem_handle_t hMem, bool blockingRead, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferReadRect"); + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueMemBufferReadRect"); auto hBuffer = hMem->getBuffer(); - ur_usm_handle_t dstHandle(context, 0, pDst); + ur_usm_handle_t dstHandle(hContext, 0, pDst); std::scoped_lock lock(hBuffer->getMutex()); return appendRegionCopyUnlocked( hBuffer, &dstHandle, blockingRead, bufferOrigin, hostOrigin, region, bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, - numEventsInWaitList, phEventWaitList, phEvent, - UR_COMMAND_MEM_BUFFER_READ_RECT); + numEventsInWaitList, phEventWaitList, phEvent); } -ur_result_t ur_command_list_manager::appendMemBufferWriteRect( +ur_result_t ur_command_list_manager::enqueueMemBufferWriteRect( ur_mem_handle_t hMem, bool blockingWrite, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferWriteRect"); + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueMemBufferWriteRect"); auto hBuffer = hMem->getBuffer(); - ur_usm_handle_t srcHandle(context, 0, pSrc); + ur_usm_handle_t srcHandle(hContext, 0, pSrc); std::scoped_lock lock(hBuffer->getMutex()); return appendRegionCopyUnlocked( &srcHandle, hBuffer, blockingWrite, hostOrigin, bufferOrigin, region, hostRowPitch, hostSlicePitch, bufferRowPitch, bufferSlicePitch, - numEventsInWaitList, phEventWaitList, phEvent, - UR_COMMAND_MEM_BUFFER_WRITE_RECT); + numEventsInWaitList, phEventWaitList, phEvent); } -ur_result_t ur_command_list_manager::appendMemBufferCopyRect( +ur_result_t ur_command_list_manager::enqueueMemBufferCopyRect( ur_mem_handle_t hSrc, ur_mem_handle_t hDst, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_command_list_manager::appendMemBufferCopyRect"); + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueMemBufferCopyRect"); auto hBufferSrc = hSrc->getBuffer(); auto hBufferDst = hDst->getBuffer(); @@ -480,27 +446,526 @@ ur_result_t ur_command_list_manager::appendMemBufferCopyRect( return appendRegionCopyUnlocked( hBufferSrc, hBufferDst, false, srcOrigin, dstOrigin, region, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, numEventsInWaitList, - phEventWaitList, phEvent, UR_COMMAND_MEM_BUFFER_COPY_RECT); + phEventWaitList, phEvent); } -ur_result_t ur_command_list_manager::appendUSMMemcpy2D( +ur_result_t ur_command_list_manager::enqueueUSMMemcpy2D( bool blocking, void *pDst, size_t dstPitch, const void *pSrc, size_t srcPitch, size_t width, size_t height, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMMemcpy2D"); + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueUSMMemcpy2D"); ur_rect_offset_t zeroOffset{0, 0, 0}; ur_rect_region_t region{width, height, 0}; - ur_usm_handle_t srcHandle(context, 0, pSrc); - ur_usm_handle_t dstHandle(context, 0, pDst); + ur_usm_handle_t srcHandle(hContext, 0, pSrc); + ur_usm_handle_t dstHandle(hContext, 0, pDst); + + return appendRegionCopyUnlocked( + &srcHandle, &dstHandle, blocking, zeroOffset, zeroOffset, region, + srcPitch, 0, dstPitch, 0, numEventsInWaitList, phEventWaitList, phEvent); +} + +ur_result_t ur_command_list_manager::enqueueCooperativeKernelLaunchExp( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY( + "ur_command_list_manager::enqueueCooperativeKernelLaunchExp"); + + UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); + UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); + + UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); + + ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(hDevice); + + std::scoped_lock Lock(hKernel->Mutex); + + ze_group_count_t zeThreadGroupDimensions{1, 1, 1}; + uint32_t WG[3]{}; + UR_CALL(calculateKernelWorkDimensions(hZeKernel, hDevice, + zeThreadGroupDimensions, WG, workDim, + pGlobalWorkSize, pLocalWorkSize)); + + auto zeSignalEvent = getSignalEvent(phEvent); + auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); + + auto memoryMigrate = [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (getZeCommandList(), dst, src, size, nullptr, + waitListView.num, waitListView.handles)); + waitListView.clear(); + }; + + // If the offset is {0, 0, 0}, pass NULL instead. + // This allows us to skip setting the offset. + bool hasOffset = false; + for (uint32_t i = 0; i < workDim; ++i) { + hasOffset |= pGlobalWorkOffset[i]; + } + if (!hasOffset) { + pGlobalWorkOffset = NULL; + } + + UR_CALL(hKernel->prepareForSubmission(hContext, hDevice, pGlobalWorkOffset, + workDim, WG[0], WG[1], WG[2], + memoryMigrate)); + + TRACK_SCOPE_LATENCY("ur_command_list_manager::" + "zeCommandListAppendLaunchCooperativeKernel"); + ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel, + (getZeCommandList(), hZeKernel, &zeThreadGroupDimensions, + zeSignalEvent, waitListView.num, waitListView.handles)); + + recordSubmittedKernel(hKernel); + + postSubmit(hZeKernel, pGlobalWorkOffset); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::enqueueTimestampRecordingExp( + bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueTimestampRecordingExp"); + + if (!phEvent) { + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + } + + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); + + (*phEvent)->recordStartTimestamp(); + + auto [timestampPtr, zeSignalEvent] = + (*phEvent)->getEventEndTimestampAndHandle(); + + ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp, + (getZeCommandList(), timestampPtr, zeSignalEvent, numWaitEvents, + pWaitEvents)); + + if (blocking) { + ZE2UR_CALL(zeCommandListHostSynchronize, (getZeCommandList(), UINT64_MAX)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::enqueueGenericCommandListsExp( + uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, + ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand, + ur_event_handle_t additionalWaitEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueGenericCommandListsExp"); + + auto zeSignalEvent = getSignalEvent(phEvent); + auto [pWaitEvents, numWaitEvents] = getWaitListView( + phEventWaitList, numEventsInWaitList, additionalWaitEvent); + + ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, + (getZeCommandList(), numCommandLists, phCommandLists, + zeSignalEvent, numWaitEvents, pWaitEvents)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::enqueueCommandBufferExp( + ur_exp_command_buffer_handle_t hCommandBuffer, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + auto bufferCommandListLocked = hCommandBuffer->commandListManager.lock(); + ze_command_list_handle_t commandBufferCommandList = + bufferCommandListLocked->zeCommandList.get(); + ur_event_handle_t internalEvent = nullptr; + if (phEvent == nullptr) { + phEvent = &internalEvent; + } + ur_event_handle_t executionEvent = + hCommandBuffer->getExecutionEventUnlocked(); + + UR_CALL(enqueueGenericCommandListsExp( + 1, &commandBufferCommandList, phEvent, numEventsInWaitList, + phEventWaitList, UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, executionEvent)); + UR_CALL(hCommandBuffer->registerExecutionEventUnlocked(*phEvent)); + if (internalEvent != nullptr) { + internalEvent->release(); + } + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::enqueueMemImageRead( + ur_mem_handle_t hMem, bool blockingRead, ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pDst, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueMemImageRead"); + + auto hImage = hMem->getImage(); + + auto zeSignalEvent = getSignalEvent(phEvent); + auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); + + auto [zeImage, zeRegion] = + hImage->getRWRegion(origin, region, rowPitch, slicePitch); + + ZE2UR_CALL(zeCommandListAppendImageCopyToMemory, + (getZeCommandList(), pDst, zeImage, &zeRegion, zeSignalEvent, + waitListView.num, waitListView.handles)); + + if (blockingRead) { + ZE2UR_CALL(zeCommandListHostSynchronize, (getZeCommandList(), UINT64_MAX)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::enqueueMemImageWrite( + ur_mem_handle_t hMem, bool blockingWrite, ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueMemImageWrite"); + + auto hImage = hMem->getImage(); + + auto zeSignalEvent = getSignalEvent(phEvent); + auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); + + auto [zeImage, zeRegion] = + hImage->getRWRegion(origin, region, rowPitch, slicePitch); + + ZE2UR_CALL(zeCommandListAppendImageCopyFromMemory, + (getZeCommandList(), zeImage, pSrc, &zeRegion, zeSignalEvent, + waitListView.num, waitListView.handles)); + + if (blockingWrite) { + ZE2UR_CALL(zeCommandListHostSynchronize, (getZeCommandList(), UINT64_MAX)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::enqueueMemImageCopy( + ur_mem_handle_t hSrc, ur_mem_handle_t hDst, ur_rect_offset_t srcOrigin, + ur_rect_offset_t dstOrigin, ur_rect_region_t region, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueMemImageWrite"); + + auto hImageSrc = hSrc->getImage(); + auto hImageDst = hDst->getImage(); + + auto zeSignalEvent = getSignalEvent(phEvent); + auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); + + auto desc = ur_mem_image_t::getCopyRegions(*hImageSrc, *hImageDst, srcOrigin, + dstOrigin, region); + + auto [zeImageSrc, zeRegionSrc] = desc.src; + auto [zeImageDst, zeRegionDst] = desc.dst; + + ZE2UR_CALL(zeCommandListAppendImageCopyRegion, + (getZeCommandList(), zeImageDst, zeImageSrc, &zeRegionDst, + &zeRegionSrc, zeSignalEvent, waitListView.num, + waitListView.handles)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::enqueueMemBufferMap( + ur_mem_handle_t hMem, bool blockingMap, ur_map_flags_t mapFlags, + size_t offset, size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, + void **ppRetMap) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueMemBufferMap"); + + auto hBuffer = hMem->getBuffer(); + + std::scoped_lock lock(hBuffer->getMutex()); + + auto zeSignalEvent = getSignalEvent(phEvent); + auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); + + auto pDst = ur_cast(hBuffer->mapHostPtr( + mapFlags, offset, size, [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (getZeCommandList(), dst, src, size, nullptr, + waitListView.num, waitListView.handles)); + waitListView.clear(); + })); + *ppRetMap = pDst; + + if (waitListView) { + // If memory was not migrated, we need to wait on the events here. + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (getZeCommandList(), waitListView.num, waitListView.handles)); + } + + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (getZeCommandList(), zeSignalEvent)); + } + + if (blockingMap) { + ZE2UR_CALL(zeCommandListHostSynchronize, (getZeCommandList(), UINT64_MAX)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::enqueueMemUnmap( + ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueMemUnmap"); + + auto hBuffer = hMem->getBuffer(); + + auto zeSignalEvent = getSignalEvent(phEvent); + auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); + + // TODO: currently unmapHostPtr deallocates memory immediately, + // since the memory might be used by the user, we need to make sure + // all dependencies are completed. + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (getZeCommandList(), waitListView.num, waitListView.handles)); + waitListView.clear(); + + hBuffer->unmapHostPtr(pMappedPtr, [&](void *src, void *dst, size_t size) { + ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, + (getZeCommandList(), dst, src, size, nullptr, + waitListView.num, waitListView.handles)); + }); + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (getZeCommandList(), zeSignalEvent)); + } + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::enqueueUSMFill2D( + void * /*pMem*/, size_t /*pitch*/, size_t /*patternSize*/, + const void * /*pPattern*/, size_t /*width*/, size_t /*height*/, + uint32_t /*numEventsInWaitList*/, + const ur_event_handle_t * /*phEventWaitList*/, + ur_event_handle_t * /*phEvent*/) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} - return appendRegionCopyUnlocked(&srcHandle, &dstHandle, blocking, zeroOffset, - zeroOffset, region, srcPitch, 0, dstPitch, 0, - numEventsInWaitList, phEventWaitList, phEvent, - UR_COMMAND_MEM_BUFFER_COPY_RECT); +static void *getGlobalPointerFromModule(ze_module_handle_t hModule, + size_t offset, size_t count, + const char *name) { + // Find global variable pointer + size_t globalVarSize = 0; + void *globalVarPtr = nullptr; + ZE2UR_CALL_THROWS(zeModuleGetGlobalPointer, + (hModule, name, &globalVarSize, &globalVarPtr)); + if (globalVarSize < offset + count) { + setErrorMessage("Write device global variable is out of range.", + UR_RESULT_ERROR_INVALID_VALUE, + static_cast(ZE_RESULT_ERROR_INVALID_ARGUMENT)); + throw UR_RESULT_ERROR_ADAPTER_SPECIFIC; + } + return globalVarPtr; +} + +ur_result_t ur_command_list_manager::enqueueDeviceGlobalVariableWrite( + ur_program_handle_t hProgram, const char *name, bool blockingWrite, + size_t count, size_t offset, const void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY( + "ur_command_list_manager::enqueueDeviceGlobalVariableWrite"); + + // TODO: make getZeModuleHandle thread-safe + ze_module_handle_t zeModule = + hProgram->getZeModuleHandle(this->hDevice->ZeDevice); + + // Find global variable pointer + auto globalVarPtr = getGlobalPointerFromModule(zeModule, offset, count, name); + + // Locking is done inside enqueueUSMMemcpy + return enqueueUSMMemcpy(blockingWrite, ur_cast(globalVarPtr) + offset, + pSrc, count, numEventsInWaitList, phEventWaitList, + phEvent); +} + +ur_result_t ur_command_list_manager::enqueueDeviceGlobalVariableRead( + ur_program_handle_t hProgram, const char *name, bool blockingRead, + size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY( + "ur_command_list_manager::enqueueDeviceGlobalVariableRead"); + + // TODO: make getZeModuleHandle thread-safe + ze_module_handle_t zeModule = + hProgram->getZeModuleHandle(this->hDevice->ZeDevice); + + // Find global variable pointer + auto globalVarPtr = getGlobalPointerFromModule(zeModule, offset, count, name); + + // Locking is done inside enqueueUSMMemcpy + return enqueueUSMMemcpy(blockingRead, pDst, + ur_cast(globalVarPtr) + offset, count, + numEventsInWaitList, phEventWaitList, phEvent); +} + +ur_result_t ur_command_list_manager::enqueueReadHostPipe( + ur_program_handle_t /*hProgram*/, const char * /*pipe_symbol*/, + bool /*blocking*/, void * /*pDst*/, size_t /*size*/, + uint32_t /*numEventsInWaitList*/, + const ur_event_handle_t * /*phEventWaitList*/, + ur_event_handle_t * /*phEvent*/) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t ur_command_list_manager::enqueueWriteHostPipe( + ur_program_handle_t /*hProgram*/, const char * /*pipe_symbol*/, + bool /*blocking*/, void * /*pSrc*/, size_t /*size*/, + uint32_t /*numEventsInWaitList*/, + const ur_event_handle_t * /*phEventWaitList*/, + ur_event_handle_t * /*phEvent*/) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t ur_command_list_manager::enqueueUSMDeviceAllocExp( + ur_usm_pool_handle_t, const size_t, + const ur_exp_async_usm_alloc_properties_t *, uint32_t, + const ur_event_handle_t *, void **, ur_event_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t ur_command_list_manager::enqueueUSMSharedAllocExp( + ur_usm_pool_handle_t, const size_t, + const ur_exp_async_usm_alloc_properties_t *, uint32_t, + const ur_event_handle_t *, void **, ur_event_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t ur_command_list_manager::enqueueUSMHostAllocExp( + ur_usm_pool_handle_t, const size_t, + const ur_exp_async_usm_alloc_properties_t *, uint32_t, + const ur_event_handle_t *, void **, ur_event_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t +ur_command_list_manager::enqueueUSMFreeExp(ur_usm_pool_handle_t, void *, + uint32_t, const ur_event_handle_t *, + ur_event_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t ur_command_list_manager::bindlessImagesImageCopyExp( + const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, + const ur_image_desc_t *pDstImageDesc, + const ur_image_format_t *pSrcImageFormat, + const ur_image_format_t *pDstImageFormat, + ur_exp_image_copy_region_t *pCopyRegion, + ur_exp_image_copy_flags_t imageCopyFlags, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + auto zeSignalEvent = getSignalEvent(phEvent); + auto waitListView = getWaitListView(phEventWaitList, numEventsInWaitList); + + return bindlessImagesHandleCopyFlags( + pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, + pDstImageFormat, pCopyRegion, imageCopyFlags, getZeCommandList(), + zeSignalEvent, waitListView.num, waitListView.handles); +} + +ur_result_t ur_command_list_manager::bindlessImagesWaitExternalSemaphoreExp( + ur_exp_external_semaphore_handle_t /*hSemaphore*/, bool /*hasWaitValue*/, + uint64_t /*waitValue*/, uint32_t /*numEventsInWaitList*/, + const ur_event_handle_t * /*phEventWaitList*/, + ur_event_handle_t * /*phEvent*/) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t ur_command_list_manager::bindlessImagesSignalExternalSemaphoreExp( + ur_exp_external_semaphore_handle_t /*hSemaphore*/, bool /*hasSignalValue*/, + uint64_t /*signalValue*/, uint32_t /*numEventsInWaitList*/, + const ur_event_handle_t * /*phEventWaitList*/, + ur_event_handle_t * /*phEvent*/) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t ur_command_list_manager::enqueueKernelLaunchCustomExp( + ur_kernel_handle_t /*hKernel*/, uint32_t /*workDim*/, + const size_t * /*pGlobalWorkOffset*/, const size_t * /*pGlobalWorkSize*/, + const size_t * /*pLocalWorkSize*/, uint32_t /*numPropsInLaunchPropList*/, + const ur_exp_launch_property_t * /*launchPropList*/, + uint32_t /*numEventsInWaitList*/, + const ur_event_handle_t * /*phEventWaitList*/, + ur_event_handle_t * /*phEvent*/) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +ur_result_t ur_command_list_manager::enqueueNativeCommandExp( + ur_exp_enqueue_native_command_function_t, void *, uint32_t, + const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, + uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { + UR_LOG_LEGACY( + ERR, logger::LegacyMessage("[UR][L0_v2] {} function not implemented!"), + "{} function not implemented!", __FUNCTION__); + + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +void ur_command_list_manager::recordSubmittedKernel( + ur_kernel_handle_t hKernel) { + submittedKernels.push_back(hKernel); + hKernel->RefCount.increment(); } ze_command_list_handle_t ur_command_list_manager::getZeCommandList() { return zeCommandList.get(); } + +ur_result_t ur_command_list_manager::enqueueEventsWait( + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueEventsWait"); + + auto zeSignalEvent = getSignalEvent(phEvent); + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); + + if (numWaitEvents > 0) { + ZE2UR_CALL(zeCommandListAppendWaitOnEvents, + (zeCommandList.get(), numWaitEvents, pWaitEvents)); + } + + if (zeSignalEvent) { + ZE2UR_CALL(zeCommandListAppendSignalEvent, + (zeCommandList.get(), zeSignalEvent)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::enqueueEventsWaitWithBarrier( + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + TRACK_SCOPE_LATENCY("ur_command_list_manager::enqueueEventsWaitWithBarrier"); + + auto zeSignalEvent = getSignalEvent(phEvent); + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); + + ZE2UR_CALL(zeCommandListAppendBarrier, + (zeCommandList.get(), zeSignalEvent, numWaitEvents, pWaitEvents)); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_command_list_manager::releaseSubmittedKernels() { + // Free deferred kernels + for (auto &hKernel : submittedKernels) { + UR_CALL(hKernel->release()); + } + submittedKernels.clear(); + return UR_RESULT_SUCCESS; +} \ No newline at end of file diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp index 74c3f85ea3643..b8402c8347e8a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp @@ -35,11 +35,9 @@ struct wait_list_view { }; struct ur_command_list_manager { - ur_command_list_manager(ur_context_handle_t context, ur_device_handle_t device, - v2::raii::command_list_unique_handle &&commandList, - v2::event_flags_t flags, ur_queue_t_ *queue); + v2::raii::command_list_unique_handle &&commandList); ur_command_list_manager(const ur_command_list_manager &src) = delete; ur_command_list_manager(ur_command_list_manager &&src) = default; @@ -49,125 +47,243 @@ struct ur_command_list_manager { ~ur_command_list_manager(); - ur_result_t appendKernelLaunch(ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, - const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); + /************ Helper methods *************/ + ur_result_t enqueueGenericCommandListsExp( + uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, + ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand, + ur_event_handle_t additionalWaitEvent); - ur_result_t appendUSMMemcpy(bool blocking, void *pDst, const void *pSrc, - size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); - ur_result_t appendMemBufferRead(ur_mem_handle_t hBuffer, bool blockingRead, - size_t offset, size_t size, void *pDst, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); + void recordSubmittedKernel(ur_kernel_handle_t hKernel); + + ur_result_t releaseSubmittedKernels(); + + ze_command_list_handle_t getZeCommandList(); + + wait_list_view + getWaitListView(const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents, + ur_event_handle_t additionalWaitEvent = nullptr); + ze_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent); - ur_result_t appendMemBufferWrite(ur_mem_handle_t hBuffer, bool blockingWrite, - size_t offset, size_t size, const void *pSrc, + /************ Generic queue methods *************/ + ur_result_t enqueueEventsWait(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t + enqueueEventsWaitWithBarrier(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t enqueueMemBufferRead(ur_mem_handle_t hBuffer, bool blockingRead, + size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); - - ur_result_t appendMemBufferCopy(ur_mem_handle_t hBufferSrc, - ur_mem_handle_t hBufferDst, size_t srcOffset, - size_t dstOffset, size_t size, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); - - ur_result_t appendMemBufferReadRect( + ur_result_t enqueueMemBufferWrite(ur_mem_handle_t hBuffer, bool blockingWrite, + size_t offset, size_t size, + const void *pSrc, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t enqueueMemBufferReadRect( ur_mem_handle_t hBuffer, bool blockingRead, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); - - ur_result_t appendMemBufferWriteRect( + ur_result_t enqueueMemBufferWriteRect( ur_mem_handle_t hBuffer, bool blockingWrite, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); - - ur_result_t appendMemBufferCopyRect( + ur_result_t enqueueMemBufferCopy(ur_mem_handle_t hBufferSrc, + ur_mem_handle_t hBufferDst, size_t srcOffset, + size_t dstOffset, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t enqueueMemBufferCopyRect( ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); - - ur_result_t appendUSMMemcpy2D(bool blocking, void *pDst, size_t dstPitch, - const void *pSrc, size_t srcPitch, size_t width, - size_t height, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); - - ur_result_t appendMemBufferFill(ur_mem_handle_t hBuffer, const void *pPattern, - size_t patternSize, size_t offset, + ur_result_t enqueueMemBufferFill(ur_mem_handle_t hBuffer, + const void *pPattern, size_t patternSize, + size_t offset, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t enqueueMemImageRead(ur_mem_handle_t hImage, bool blockingRead, + ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pDst, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t enqueueMemImageWrite(ur_mem_handle_t hImage, bool blockingWrite, + ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pSrc, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t + enqueueMemImageCopy(ur_mem_handle_t hImageSrc, ur_mem_handle_t hImageDst, + ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, + ur_rect_region_t region, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t enqueueMemBufferMap(ur_mem_handle_t hBuffer, bool blockingMap, + ur_map_flags_t mapFlags, size_t offset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, void **ppRetMap); + ur_result_t enqueueMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t enqueueUSMFill(void *pMem, size_t patternSize, + const void *pPattern, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t enqueueUSMMemcpy(bool blocking, void *pDst, const void *pSrc, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t enqueueUSMFill2D(void *, size_t, size_t, const void *, size_t, + size_t, uint32_t, const ur_event_handle_t *, + ur_event_handle_t *); + ur_result_t enqueueUSMMemcpy2D(bool, void *, size_t, const void *, size_t, + size_t, size_t, uint32_t, + const ur_event_handle_t *, + ur_event_handle_t *); + ur_result_t enqueueUSMPrefetch(const void *pMem, size_t size, + ur_usm_migration_flags_t flags, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t enqueueUSMAdvise(const void *pMem, size_t size, + ur_usm_advice_flags_t advice, + ur_event_handle_t *phEvent); + ur_result_t enqueueDeviceGlobalVariableWrite( + ur_program_handle_t hProgram, const char *name, bool blockingWrite, + size_t count, size_t offset, const void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t enqueueDeviceGlobalVariableRead( + ur_program_handle_t hProgram, const char *name, bool blockingRead, + size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); + ur_result_t enqueueReadHostPipe(ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, + void *pDst, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); - - ur_result_t appendUSMFill(void *pMem, size_t patternSize, - const void *pPattern, size_t size, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); - - ur_result_t appendUSMPrefetch(const void *pMem, size_t size, - ur_usm_migration_flags_t flags, + ur_result_t enqueueWriteHostPipe(ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, + void *pSrc, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t enqueueUSMDeviceAllocExp( + ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent); + ur_result_t enqueueUSMSharedAllocExp( + ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent); + ur_result_t + enqueueUSMHostAllocExp(ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, void **ppMem, + ur_event_handle_t *phEvent); + ur_result_t enqueueUSMFreeExp(ur_usm_pool_handle_t pPool, void *pMem, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); - - ur_result_t appendUSMAdvise(const void *pMem, size_t size, - ur_usm_advice_flags_t advice, - ur_event_handle_t *phEvent); - - ur_result_t appendBarrier(uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); - - ze_command_list_handle_t getZeCommandList(); - - wait_list_view - getWaitListView(const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents, - ur_event_handle_t additionalWaitEvent = nullptr); - ze_event_handle_t getSignalEvent(ur_event_handle_t *hUserEvent, - ur_command_t commandType); + ur_result_t bindlessImagesImageCopyExp( + const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, + const ur_image_desc_t *pDstImageDesc, + const ur_image_format_t *pSrcImageFormat, + const ur_image_format_t *pDstImageFormat, + ur_exp_image_copy_region_t *pCopyRegion, + ur_exp_image_copy_flags_t imageCopyFlags, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); + ur_result_t bindlessImagesWaitExternalSemaphoreExp( + ur_exp_external_semaphore_handle_t hSemaphore, bool hasWaitValue, + uint64_t waitValue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); + ur_result_t bindlessImagesSignalExternalSemaphoreExp( + ur_exp_external_semaphore_handle_t hSemaphore, bool hasSignalValue, + uint64_t signalValue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); + ur_result_t enqueueCooperativeKernelLaunchExp( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); + ur_result_t + enqueueTimestampRecordingExp(bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t + enqueueCommandBufferExp(ur_exp_command_buffer_handle_t hCommandBuffer, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t enqueueKernelLaunch(ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t enqueueKernelLaunchCustomExp( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const ur_exp_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + ur_result_t + enqueueNativeCommandExp(ur_exp_enqueue_native_command_function_t, void *, + uint32_t, const ur_mem_handle_t *, + const ur_exp_enqueue_native_command_properties_t *, + uint32_t, const ur_event_handle_t *, + ur_event_handle_t *); private: ur_result_t appendGenericFillUnlocked( ur_mem_buffer_t *hBuffer, size_t offset, size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, - ur_command_t commandType); + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); - ur_result_t appendGenericCopyUnlocked( - ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, - size_t srcOffset, size_t dstOffset, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent, ur_command_t commandType); + ur_result_t + appendGenericCopyUnlocked(ur_mem_buffer_t *src, ur_mem_buffer_t *dst, + bool blocking, size_t srcOffset, size_t dstOffset, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); ur_result_t appendRegionCopyUnlocked( ur_mem_buffer_t *src, ur_mem_buffer_t *dst, bool blocking, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, - ur_command_t commandType); - // UR context associated with this command-buffer - ur_context_handle_t context; - // Device associated with this command-buffer - ur_device_handle_t device; - v2::raii::cache_borrowed_event_pool eventPool; + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); + + ur_context_handle_t hContext; + ur_device_handle_t hDevice; + + std::vector submittedKernels; v2::raii::command_list_unique_handle zeCommandList; - ur_queue_t_ *queue; std::vector waitList; }; diff --git a/unified-runtime/source/adapters/level_zero/v2/lockable.hpp b/unified-runtime/source/adapters/level_zero/v2/lockable.hpp index 92c78d88c4042..d88e3d3a454cb 100644 --- a/unified-runtime/source/adapters/level_zero/v2/lockable.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/lockable.hpp @@ -50,6 +50,10 @@ template struct lockable { std::unique_lock lock{mut_}; return locked(&object_, std::move(lock)); } + template locked lock() { + std::unique_lock lock{mut_}; + return locked(&object_, std::move(lock)); + } T *get_no_lock() { return &object_; } private: diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp index 60f82bfddb310..0adc0373e4b94 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_create.cpp @@ -14,6 +14,45 @@ #include "queue_api.hpp" #include "queue_handle.hpp" #include "queue_immediate_in_order.hpp" +#include "queue_immediate_out_of_order.hpp" + +namespace v2 { + +using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; + +static uint32_t getZeOrdinal(ur_device_handle_t hDevice) { + return hDevice->QueueGroup[queue_group_type::Compute].ZeOrdinal; +} + +static std::optional getZeIndex(const ur_queue_properties_t *pProps) { + if (pProps && pProps->pNext) { + const ur_base_properties_t *extendedDesc = + reinterpret_cast(pProps->pNext); + if (extendedDesc->stype == UR_STRUCTURE_TYPE_QUEUE_INDEX_PROPERTIES) { + const ur_queue_index_properties_t *indexProperties = + reinterpret_cast(extendedDesc); + return indexProperties->computeIndex; + } + } + return std::nullopt; +} + +static ze_command_queue_priority_t getZePriority(ur_queue_flags_t flags) { + if ((flags & UR_QUEUE_FLAG_PRIORITY_LOW) != 0) + return ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW; + if ((flags & UR_QUEUE_FLAG_PRIORITY_HIGH) != 0) + return ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH; + return ZE_COMMAND_QUEUE_PRIORITY_NORMAL; +} + +static event_flags_t eventFlagsFromQueueFlags(ur_queue_flags_t flags) { + event_flags_t eventFlags = EVENT_FLAGS_COUNTER; + if (flags & UR_QUEUE_FLAG_PROFILING_ENABLE) + eventFlags |= EVENT_FLAGS_PROFILING_ENABLED; + return eventFlags; +} + +} // namespace v2 namespace ur::level_zero { ur_result_t urQueueCreate(ur_context_handle_t hContext, @@ -24,9 +63,26 @@ ur_result_t urQueueCreate(ur_context_handle_t hContext, return UR_RESULT_ERROR_INVALID_DEVICE; } - // TODO: For now, always use immediate, in-order - *phQueue = ur_queue_handle_t_::create( - hContext, hDevice, pProperties); + ur_queue_flags_t flags = 0; + if (pProperties) { + flags = pProperties->flags; + } + + auto zeIndex = v2::getZeIndex(pProperties); + + if ((flags & UR_QUEUE_FLAG_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0 && + !zeIndex.has_value()) { + *phQueue = + ur_queue_handle_t_::create( + hContext, hDevice, v2::getZeOrdinal(hDevice), + v2::getZePriority(flags), v2::eventFlagsFromQueueFlags(flags), + flags); + } else { + *phQueue = ur_queue_handle_t_::create( + hContext, hDevice, v2::getZeOrdinal(hDevice), v2::getZePriority(flags), + zeIndex, v2::eventFlagsFromQueueFlags(flags), flags); + } + return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -55,8 +111,19 @@ ur_result_t urQueueCreateWithNativeHandle( } } + auto commandListHandle = v2::raii::command_list_unique_handle( + reinterpret_cast(hNativeQueue), + [ownNativeHandle](ze_command_list_handle_t hZeCommandList) { + if (ownNativeHandle) { + if (checkL0LoaderTeardown()) { + ZE_CALL_NOCHECK(zeCommandListDestroy, (hZeCommandList)); + } + } + }); + *phQueue = ur_queue_handle_t_::create( - hContext, hDevice, hNativeQueue, flags, ownNativeHandle); + hContext, hDevice, std::move(commandListHandle), + v2::eventFlagsFromQueueFlags(flags), flags); return UR_RESULT_SUCCESS; } catch (...) { diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp index 75bf4a16faf61..9831afdbc9e4c 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_handle.hpp @@ -15,11 +15,13 @@ #include "../common.hpp" #include "queue_immediate_in_order.hpp" +#include "queue_immediate_out_of_order.hpp" #include #include struct ur_queue_handle_t_ : ur::handle_base { - using data_variant = std::variant; + using data_variant = std::variant; data_variant queue_data; static constexpr uintptr_t queue_offset = diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 33c05a1402012..88eb0f5b593e5 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -23,83 +23,28 @@ namespace v2 { -wait_list_view ur_queue_immediate_in_order_t::getWaitListView( - locked &commandList, - const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents, - ur_event_handle_t additionalWaitEvent) { - return commandList->getWaitListView(phWaitEvents, numWaitEvents, - additionalWaitEvent); -} - -static uint32_t getZeOrdinal(ur_device_handle_t hDevice) { - return hDevice->QueueGroup[queue_group_type::Compute].ZeOrdinal; -} - -static std::optional getZeIndex(const ur_queue_properties_t *pProps) { - if (pProps && pProps->pNext) { - const ur_base_properties_t *extendedDesc = - reinterpret_cast(pProps->pNext); - if (extendedDesc->stype == UR_STRUCTURE_TYPE_QUEUE_INDEX_PROPERTIES) { - const ur_queue_index_properties_t *indexProperties = - reinterpret_cast(extendedDesc); - return indexProperties->computeIndex; - } - } - return std::nullopt; -} - -static ze_command_queue_priority_t getZePriority(ur_queue_flags_t flags) { - if ((flags & UR_QUEUE_FLAG_PRIORITY_LOW) != 0) - return ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_LOW; - if ((flags & UR_QUEUE_FLAG_PRIORITY_HIGH) != 0) - return ZE_COMMAND_QUEUE_PRIORITY_PRIORITY_HIGH; - return ZE_COMMAND_QUEUE_PRIORITY_NORMAL; -} - -static event_flags_t eventFlagsFromQueueFlags(ur_queue_flags_t flags) { - event_flags_t eventFlags = EVENT_FLAGS_COUNTER; - if (flags & UR_QUEUE_FLAG_PROFILING_ENABLE) - eventFlags |= EVENT_FLAGS_PROFILING_ENABLED; - return eventFlags; -} - ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( - ur_context_handle_t hContext, ur_device_handle_t hDevice, - const ur_queue_properties_t *pProps) - : hContext(hContext), hDevice(hDevice), flags(pProps ? pProps->flags : 0), + ur_context_handle_t hContext, ur_device_handle_t hDevice, uint32_t ordinal, + ze_command_queue_priority_t priority, std::optional index, + event_flags_t eventFlags, ur_queue_flags_t flags) + : hContext(hContext), hDevice(hDevice), commandListManager( hContext, hDevice, hContext->getCommandListCache().getImmediateCommandList( hDevice->ZeDevice, - {true, getZeOrdinal(hDevice), - true /* always enable copy offload */}, - ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, - getZePriority(pProps ? pProps->flags : ur_queue_flags_t{}), - getZeIndex(pProps)), - eventFlagsFromQueueFlags(flags), this) {} + {true, ordinal, true /* always enable copy offload */}, + ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, priority, index)), + flags(flags), eventPool(hContext->getEventPoolCache().borrow( + hDevice->Id.value(), eventFlags)) {} ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( ur_context_handle_t hContext, ur_device_handle_t hDevice, - ur_native_handle_t hNativeHandle, ur_queue_flags_t flags, bool ownZeQueue) - : hContext(hContext), hDevice(hDevice), flags(flags), - commandListManager( - hContext, hDevice, - raii::command_list_unique_handle( - reinterpret_cast(hNativeHandle), - [ownZeQueue](ze_command_list_handle_t hZeCommandList) { - if (ownZeQueue) { - if (checkL0LoaderTeardown()) { - ZE_CALL_NOCHECK(zeCommandListDestroy, (hZeCommandList)); - } - } - }), - eventFlagsFromQueueFlags(flags), this) {} - -ze_event_handle_t ur_queue_immediate_in_order_t::getSignalEvent( - locked &commandList, ur_event_handle_t *hUserEvent, - ur_command_t commandType) { - return commandList->getSignalEvent(hUserEvent, commandType); -} + raii::command_list_unique_handle commandListHandle, + event_flags_t eventFlags, ur_queue_flags_t flags) + : hContext(hContext), hDevice(hDevice), + commandListManager(hContext, hDevice, std::move(commandListHandle)), + flags(flags), eventPool(hContext->getEventPoolCache().borrow( + hDevice->Id.value(), eventFlags)) {} ur_result_t ur_queue_immediate_in_order_t::queueGetInfo(ur_queue_info_t propName, @@ -145,35 +90,22 @@ ur_queue_immediate_in_order_t::queueGetInfo(ur_queue_info_t propName, ur_result_t ur_queue_immediate_in_order_t::queueGetNativeHandle( ur_queue_native_desc_t * /*pDesc*/, ur_native_handle_t *phNativeQueue) { *phNativeQueue = reinterpret_cast( - this->commandListManager.get_no_lock()->getZeCommandList()); + commandListManager.get_no_lock()->getZeCommandList()); return UR_RESULT_SUCCESS; } ur_result_t ur_queue_immediate_in_order_t::queueFinish() { TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::queueFinish"); - auto commandListLocked = commandListManager.lock(); - // TODO: use zeEventHostSynchronize instead? - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::zeCommandListHostSynchronize"); - ZE2UR_CALL(zeCommandListHostSynchronize, - (commandListLocked->getZeCommandList(), UINT64_MAX)); + auto lockedCommandListManager = commandListManager.lock(); - // Free deferred kernels - for (auto &hKernel : submittedKernels) { - UR_CALL(hKernel->release()); - } - submittedKernels.clear(); + ZE2UR_CALL(zeCommandListHostSynchronize, + (lockedCommandListManager->getZeCommandList(), UINT64_MAX)); + UR_CALL(lockedCommandListManager->releaseSubmittedKernels()); return UR_RESULT_SUCCESS; } -void ur_queue_immediate_in_order_t::recordSubmittedKernel( - ur_kernel_handle_t hKernel) { - submittedKernels.push_back(hKernel); - hKernel->RefCount.increment(); -} - ur_result_t ur_queue_immediate_in_order_t::queueFlush() { return UR_RESULT_SUCCESS; } @@ -186,76 +118,6 @@ ur_queue_immediate_in_order_t::~ur_queue_immediate_in_order_t() { } } -ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunch( - ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueKernelLaunch"); - - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendKernelLaunch( - hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, - numEventsInWaitList, phEventWaitList, phEvent)); - - recordSubmittedKernel(hKernel); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWait( - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueEventsWait"); - - auto commandListLocked = commandListManager.lock(); - if (!numEventsInWaitList && !phEvent) { - // nop - return UR_RESULT_SUCCESS; - } - - auto zeSignalEvent = - getSignalEvent(commandListLocked, phEvent, UR_COMMAND_EVENTS_WAIT); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - if (numWaitEvents > 0) { - ZE2UR_CALL( - zeCommandListAppendWaitOnEvents, - (commandListLocked->getZeCommandList(), numWaitEvents, pWaitEvents)); - } - - if (zeSignalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListLocked->getZeCommandList(), zeSignalEvent)); - } - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrierImpl( - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrier"); - - auto commandListLocked = commandListManager.lock(); - if (!numEventsInWaitList && !phEvent) { - // nop - return UR_RESULT_SUCCESS; - } - - auto zeSignalEvent = getSignalEvent(commandListLocked, phEvent, - UR_COMMAND_EVENTS_WAIT_WITH_BARRIER); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - ZE2UR_CALL(zeCommandListAppendBarrier, - (commandListLocked->getZeCommandList(), zeSignalEvent, - numWaitEvents, pWaitEvents)); - - return UR_RESULT_SUCCESS; -} - ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrier( uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { @@ -266,677 +128,11 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrier( // need to use barrier if profiling is enabled: see // zeCommandListAppendWaitOnEvents if ((flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0) { - return enqueueEventsWaitWithBarrierImpl(numEventsInWaitList, - phEventWaitList, phEvent); + return commandListManager.lock()->enqueueEventsWaitWithBarrier( + numEventsInWaitList, phEventWaitList, phEvent); } else { return enqueueEventsWait(numEventsInWaitList, phEventWaitList, phEvent); } } -ur_result_t ur_queue_immediate_in_order_t::enqueueEventsWaitWithBarrierExt( - const ur_exp_enqueue_ext_properties_t *, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - return enqueueEventsWaitWithBarrier(numEventsInWaitList, phEventWaitList, - phEvent); -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferRead( - ur_mem_handle_t hMem, bool blockingRead, size_t offset, size_t size, - void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferRead"); - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferRead( - hMem, blockingRead, offset, size, pDst, numEventsInWaitList, - phEventWaitList, phEvent)); - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferWrite( - ur_mem_handle_t hMem, bool blockingWrite, size_t offset, size_t size, - const void *pSrc, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferWrite"); - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferWrite( - hMem, blockingWrite, offset, size, pSrc, numEventsInWaitList, - phEventWaitList, phEvent)); - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferReadRect( - ur_mem_handle_t hMem, bool blockingRead, ur_rect_offset_t bufferOrigin, - ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, - size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, - void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueMemBufferReadRect"); - - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferReadRect( - hMem, blockingRead, bufferOrigin, hostOrigin, region, bufferRowPitch, - bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, numEventsInWaitList, - phEventWaitList, phEvent)); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferWriteRect( - ur_mem_handle_t hMem, bool blockingWrite, ur_rect_offset_t bufferOrigin, - ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, - size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, - void *pSrc, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueMemBufferWriteRect"); - - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferWriteRect( - hMem, blockingWrite, bufferOrigin, hostOrigin, region, bufferRowPitch, - bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, numEventsInWaitList, - phEventWaitList, phEvent)); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferCopy( - ur_mem_handle_t hSrc, ur_mem_handle_t hDst, size_t srcOffset, - size_t dstOffset, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferCopy"); - - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferCopy( - hSrc, hDst, srcOffset, dstOffset, size, numEventsInWaitList, - phEventWaitList, phEvent)); - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferCopyRect( - ur_mem_handle_t hSrc, ur_mem_handle_t hDst, ur_rect_offset_t srcOrigin, - ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, - size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueMemBufferCopyRect"); - - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferCopyRect( - hSrc, hDst, srcOrigin, dstOrigin, region, srcRowPitch, srcSlicePitch, - dstRowPitch, dstSlicePitch, numEventsInWaitList, phEventWaitList, - phEvent)); - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferFill( - ur_mem_handle_t hMem, const void *pPattern, size_t patternSize, - size_t offset, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferFill"); - - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendMemBufferFill( - hMem, pPattern, patternSize, offset, size, numEventsInWaitList, - phEventWaitList, phEvent)); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemImageRead( - ur_mem_handle_t hMem, bool blockingRead, ur_rect_offset_t origin, - ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pDst, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemImageRead"); - - auto hImage = hMem->getImage(); - - auto commandListLocked = commandListManager.lock(); - - auto zeSignalEvent = - getSignalEvent(commandListLocked, phEvent, UR_COMMAND_MEM_IMAGE_READ); - auto waitListView = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - auto [zeImage, zeRegion] = - hImage->getRWRegion(origin, region, rowPitch, slicePitch); - - ZE2UR_CALL(zeCommandListAppendImageCopyToMemory, - (commandListLocked->getZeCommandList(), pDst, zeImage, &zeRegion, - zeSignalEvent, waitListView.num, waitListView.handles)); - - if (blockingRead) { - ZE2UR_CALL(zeCommandListHostSynchronize, - (commandListLocked->getZeCommandList(), UINT64_MAX)); - } - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemImageWrite( - ur_mem_handle_t hMem, bool blockingWrite, ur_rect_offset_t origin, - ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pSrc, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemImageWrite"); - - auto hImage = hMem->getImage(); - - auto commandListLocked = commandListManager.lock(); - - auto zeSignalEvent = - getSignalEvent(commandListLocked, phEvent, UR_COMMAND_MEM_IMAGE_WRITE); - auto waitListView = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - auto [zeImage, zeRegion] = - hImage->getRWRegion(origin, region, rowPitch, slicePitch); - - ZE2UR_CALL(zeCommandListAppendImageCopyFromMemory, - (commandListLocked->getZeCommandList(), zeImage, pSrc, &zeRegion, - zeSignalEvent, waitListView.num, waitListView.handles)); - - if (blockingWrite) { - ZE2UR_CALL(zeCommandListHostSynchronize, - (commandListLocked->getZeCommandList(), UINT64_MAX)); - } - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemImageCopy( - ur_mem_handle_t hSrc, ur_mem_handle_t hDst, ur_rect_offset_t srcOrigin, - ur_rect_offset_t dstOrigin, ur_rect_region_t region, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemImageWrite"); - - auto hImageSrc = hSrc->getImage(); - auto hImageDst = hDst->getImage(); - - auto commandListLocked = commandListManager.lock(); - auto zeSignalEvent = - getSignalEvent(commandListLocked, phEvent, UR_COMMAND_MEM_IMAGE_COPY); - auto waitListView = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - auto desc = ur_mem_image_t::getCopyRegions(*hImageSrc, *hImageDst, srcOrigin, - dstOrigin, region); - - auto [zeImageSrc, zeRegionSrc] = desc.src; - auto [zeImageDst, zeRegionDst] = desc.dst; - - ZE2UR_CALL(zeCommandListAppendImageCopyRegion, - (commandListLocked->getZeCommandList(), zeImageDst, zeImageSrc, - &zeRegionDst, &zeRegionSrc, zeSignalEvent, waitListView.num, - waitListView.handles)); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemBufferMap( - ur_mem_handle_t hMem, bool blockingMap, ur_map_flags_t mapFlags, - size_t offset, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, - void **ppRetMap) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemBufferMap"); - - auto hBuffer = hMem->getBuffer(); - - std::scoped_lock lock(hBuffer->getMutex()); - - auto commandListLocked = commandListManager.lock(); - auto zeSignalEvent = - getSignalEvent(commandListLocked, phEvent, UR_COMMAND_MEM_BUFFER_MAP); - - auto waitListView = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - auto pDst = ur_cast(hBuffer->mapHostPtr( - mapFlags, offset, size, [&](void *src, void *dst, size_t size) { - ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (commandListLocked->getZeCommandList(), dst, src, - size, nullptr, waitListView.num, - waitListView.handles)); - waitListView.clear(); - })); - *ppRetMap = pDst; - - if (waitListView) { - // If memory was not migrated, we need to wait on the events here. - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (commandListLocked->getZeCommandList(), waitListView.num, - waitListView.handles)); - } - - if (zeSignalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListLocked->getZeCommandList(), zeSignalEvent)); - } - - if (blockingMap) { - ZE2UR_CALL(zeCommandListHostSynchronize, - (commandListLocked->getZeCommandList(), UINT64_MAX)); - } - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueMemUnmap( - ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueMemUnmap"); - - auto hBuffer = hMem->getBuffer(); - - auto commandListLocked = commandListManager.lock(); - - auto zeSignalEvent = - getSignalEvent(commandListLocked, phEvent, UR_COMMAND_MEM_UNMAP); - - auto waitListView = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - // TODO: currently unmapHostPtr deallocates memory immediately, - // since the memory might be used by the user, we need to make sure - // all dependencies are completed. - ZE2UR_CALL(zeCommandListAppendWaitOnEvents, - (commandListLocked->getZeCommandList(), waitListView.num, - waitListView.handles)); - waitListView.clear(); - - hBuffer->unmapHostPtr(pMappedPtr, [&](void *src, void *dst, size_t size) { - ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (commandListLocked->getZeCommandList(), dst, src, size, - nullptr, waitListView.num, waitListView.handles)); - }); - if (zeSignalEvent) { - ZE2UR_CALL(zeCommandListAppendSignalEvent, - (commandListLocked->getZeCommandList(), zeSignalEvent)); - } - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFill( - void *pMem, size_t patternSize, const void *pPattern, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMFill"); - - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMFill(pMem, patternSize, pPattern, size, - numEventsInWaitList, phEventWaitList, - phEvent)); - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMMemcpy( - bool blocking, void *pDst, const void *pSrc, size_t size, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - // TODO: parametrize latency tracking with 'blocking' - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMMemcpy"); - - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMMemcpy(blocking, pDst, pSrc, size, - numEventsInWaitList, - phEventWaitList, phEvent)); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMPrefetch( - const void *pMem, size_t size, ur_usm_migration_flags_t flags, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMPrefetch"); - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMPrefetch( - pMem, size, flags, numEventsInWaitList, phEventWaitList, phEvent)); - return UR_RESULT_SUCCESS; -} - -ur_result_t -ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size, - ur_usm_advice_flags_t advice, - ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMAdvise"); - - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMAdvise(pMem, size, advice, phEvent)); - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFill2D( - void * /*pMem*/, size_t /*pitch*/, size_t /*patternSize*/, - const void * /*pPattern*/, size_t /*width*/, size_t /*height*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t * /*phEvent*/) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMMemcpy2D( - bool blocking, void *pDst, size_t dstPitch, const void *pSrc, - size_t srcPitch, size_t width, size_t height, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMMemcpy2D"); - auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMMemcpy2D( - blocking, pDst, dstPitch, pSrc, srcPitch, width, height, - numEventsInWaitList, phEventWaitList, phEvent)); - return UR_RESULT_SUCCESS; -} - -static void *getGlobalPointerFromModule(ze_module_handle_t hModule, - size_t offset, size_t count, - const char *name) { - // Find global variable pointer - size_t globalVarSize = 0; - void *globalVarPtr = nullptr; - ZE2UR_CALL_THROWS(zeModuleGetGlobalPointer, - (hModule, name, &globalVarSize, &globalVarPtr)); - if (globalVarSize < offset + count) { - setErrorMessage("Write device global variable is out of range.", - UR_RESULT_ERROR_INVALID_VALUE, - static_cast(ZE_RESULT_ERROR_INVALID_ARGUMENT)); - throw UR_RESULT_ERROR_ADAPTER_SPECIFIC; - } - return globalVarPtr; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueDeviceGlobalVariableWrite( - ur_program_handle_t hProgram, const char *name, bool blockingWrite, - size_t count, size_t offset, const void *pSrc, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueDeviceGlobalVariableWrite"); - - // TODO: make getZeModuleHandle thread-safe - ze_module_handle_t zeModule = - hProgram->getZeModuleHandle(this->hDevice->ZeDevice); - - // Find global variable pointer - auto globalVarPtr = getGlobalPointerFromModule(zeModule, offset, count, name); - - // Locking is done inside enqueueUSMMemcpy - return enqueueUSMMemcpy(blockingWrite, ur_cast(globalVarPtr) + offset, - pSrc, count, numEventsInWaitList, phEventWaitList, - phEvent); -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueDeviceGlobalVariableRead( - ur_program_handle_t hProgram, const char *name, bool blockingRead, - size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueDeviceGlobalVariableRead"); - - // TODO: make getZeModuleHandle thread-safe - ze_module_handle_t zeModule = - hProgram->getZeModuleHandle(this->hDevice->ZeDevice); - - // Find global variable pointer - auto globalVarPtr = getGlobalPointerFromModule(zeModule, offset, count, name); - - // Locking is done inside enqueueUSMMemcpy - return enqueueUSMMemcpy(blockingRead, pDst, - ur_cast(globalVarPtr) + offset, count, - numEventsInWaitList, phEventWaitList, phEvent); -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueReadHostPipe( - ur_program_handle_t /*hProgram*/, const char * /*pipe_symbol*/, - bool /*blocking*/, void * /*pDst*/, size_t /*size*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t * /*phEvent*/) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueWriteHostPipe( - ur_program_handle_t /*hProgram*/, const char * /*pipe_symbol*/, - bool /*blocking*/, void * /*pSrc*/, size_t /*size*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t * /*phEvent*/) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMDeviceAllocExp( - ur_usm_pool_handle_t, const size_t, - const ur_exp_async_usm_alloc_properties_t *, uint32_t, - const ur_event_handle_t *, void **, ur_event_handle_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMSharedAllocExp( - ur_usm_pool_handle_t, const size_t, - const ur_exp_async_usm_alloc_properties_t *, uint32_t, - const ur_event_handle_t *, void **, ur_event_handle_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMHostAllocExp( - ur_usm_pool_handle_t, const size_t, - const ur_exp_async_usm_alloc_properties_t *, uint32_t, - const ur_event_handle_t *, void **, ur_event_handle_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFreeExp( - ur_usm_pool_handle_t, void *, uint32_t, const ur_event_handle_t *, - ur_event_handle_t *) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t ur_queue_immediate_in_order_t::bindlessImagesImageCopyExp( - const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, - const ur_image_desc_t *pDstImageDesc, - const ur_image_format_t *pSrcImageFormat, - const ur_image_format_t *pDstImageFormat, - ur_exp_image_copy_region_t *pCopyRegion, - ur_exp_image_copy_flags_t imageCopyFlags, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - - auto commandListMgr = commandListManager.lock(); - - auto zeSignalEvent = - getSignalEvent(commandListMgr, phEvent, UR_COMMAND_MEM_IMAGE_COPY); - auto waitListView = - getWaitListView(commandListMgr, phEventWaitList, numEventsInWaitList); - - return bindlessImagesHandleCopyFlags( - pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, - pDstImageFormat, pCopyRegion, imageCopyFlags, - commandListMgr->getZeCommandList(), zeSignalEvent, waitListView.num, - waitListView.handles); -} - -ur_result_t -ur_queue_immediate_in_order_t::bindlessImagesWaitExternalSemaphoreExp( - ur_exp_external_semaphore_handle_t /*hSemaphore*/, bool /*hasWaitValue*/, - uint64_t /*waitValue*/, uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t * /*phEvent*/) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t -ur_queue_immediate_in_order_t::bindlessImagesSignalExternalSemaphoreExp( - ur_exp_external_semaphore_handle_t /*hSemaphore*/, bool /*hasSignalValue*/, - uint64_t /*signalValue*/, uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t * /*phEvent*/) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp( - ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueCooperativeKernelLaunchExp"); - - UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_NULL_HANDLE); - UR_ASSERT(hKernel->getProgramHandle(), UR_RESULT_ERROR_INVALID_NULL_POINTER); - - UR_ASSERT(workDim > 0, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - UR_ASSERT(workDim < 4, UR_RESULT_ERROR_INVALID_WORK_DIMENSION); - - ze_kernel_handle_t hZeKernel = hKernel->getZeHandle(hDevice); - - std::scoped_lock Lock(hKernel->Mutex); - - auto commandListLocked = commandListManager.lock(); - ze_group_count_t zeThreadGroupDimensions{1, 1, 1}; - uint32_t WG[3]{}; - UR_CALL(calculateKernelWorkDimensions(hZeKernel, hDevice, - zeThreadGroupDimensions, WG, workDim, - pGlobalWorkSize, pLocalWorkSize)); - - auto zeSignalEvent = - getSignalEvent(commandListLocked, phEvent, UR_COMMAND_KERNEL_LAUNCH); - - auto waitListView = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - auto memoryMigrate = [&](void *src, void *dst, size_t size) { - ZE2UR_CALL_THROWS(zeCommandListAppendMemoryCopy, - (commandListLocked->getZeCommandList(), dst, src, size, - nullptr, waitListView.num, waitListView.handles)); - waitListView.clear(); - }; - - // If the offset is {0, 0, 0}, pass NULL instead. - // This allows us to skip setting the offset. - bool hasOffset = false; - for (uint32_t i = 0; i < workDim; ++i) { - hasOffset |= pGlobalWorkOffset[i]; - } - if (!hasOffset) { - pGlobalWorkOffset = NULL; - } - - UR_CALL(hKernel->prepareForSubmission(hContext, hDevice, pGlobalWorkOffset, - workDim, WG[0], WG[1], WG[2], - memoryMigrate)); - - TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::" - "zeCommandListAppendLaunchCooperativeKernel"); - ZE2UR_CALL(zeCommandListAppendLaunchCooperativeKernel, - (commandListLocked->getZeCommandList(), hZeKernel, - &zeThreadGroupDimensions, zeSignalEvent, waitListView.num, - waitListView.handles)); - - recordSubmittedKernel(hKernel); - - postSubmit(hZeKernel, pGlobalWorkOffset); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp( - bool blocking, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueTimestampRecordingExp"); - - auto commandListLocked = commandListManager.lock(); - if (!phEvent) { - return UR_RESULT_ERROR_INVALID_NULL_HANDLE; - } - getSignalEvent(commandListLocked, phEvent, - UR_COMMAND_TIMESTAMP_RECORDING_EXP); - auto [pWaitEvents, numWaitEvents] = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList); - - (*phEvent)->recordStartTimestamp(); - - auto [timestampPtr, zeSignalEvent] = - (*phEvent)->getEventEndTimestampAndHandle(); - - ZE2UR_CALL(zeCommandListAppendWriteGlobalTimestamp, - (commandListLocked->getZeCommandList(), timestampPtr, - zeSignalEvent, numWaitEvents, pWaitEvents)); - - if (blocking) { - ZE2UR_CALL(zeCommandListHostSynchronize, - (commandListLocked->getZeCommandList(), UINT64_MAX)); - } - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp( - uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, - ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand, - ur_event_handle_t additionalWaitEvent) { - TRACK_SCOPE_LATENCY( - "ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp"); - - auto commandListLocked = commandListManager.lock(); - auto zeSignalEvent = - getSignalEvent(commandListLocked, phEvent, callerCommand); - - auto [pWaitEvents, numWaitEvents] = - getWaitListView(commandListLocked, phEventWaitList, numEventsInWaitList, - additionalWaitEvent); - - ZE2UR_CALL(zeCommandListImmediateAppendCommandListsExp, - (commandListLocked->getZeCommandList(), numCommandLists, - phCommandLists, zeSignalEvent, numWaitEvents, pWaitEvents)); - - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueCommandBufferExp( - ur_exp_command_buffer_handle_t hCommandBuffer, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { - - auto commandListLocked = hCommandBuffer->commandListManager.lock(); - ze_command_list_handle_t commandBufferCommandList = - commandListLocked->getZeCommandList(); - ur_event_handle_t internalEvent = nullptr; - if (phEvent == nullptr) { - phEvent = &internalEvent; - } - ur_event_handle_t executionEvent = - hCommandBuffer->getExecutionEventUnlocked(); - - UR_CALL(enqueueGenericCommandListsExp( - 1, &commandBufferCommandList, phEvent, numEventsInWaitList, - phEventWaitList, UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, executionEvent)); - UR_CALL(hCommandBuffer->registerExecutionEventUnlocked(*phEvent)); - if (internalEvent != nullptr) { - internalEvent->release(); - } - return UR_RESULT_SUCCESS; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueKernelLaunchCustomExp( - ur_kernel_handle_t /*hKernel*/, uint32_t /*workDim*/, - const size_t * /*pGlobalWorkOffset*/, const size_t * /*pGlobalWorkSize*/, - const size_t * /*pLocalWorkSize*/, uint32_t /*numPropsInLaunchPropList*/, - const ur_exp_launch_property_t * /*launchPropList*/, - uint32_t /*numEventsInWaitList*/, - const ur_event_handle_t * /*phEventWaitList*/, - ur_event_handle_t * /*phEvent*/) { - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} - -ur_result_t ur_queue_immediate_in_order_t::enqueueNativeCommandExp( - ur_exp_enqueue_native_command_function_t, void *, uint32_t, - const ur_mem_handle_t *, const ur_exp_enqueue_native_command_properties_t *, - uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { - UR_LOG_LEGACY( - ERR, logger::LegacyMessage("[UR][L0_v2] {} function not implemented!"), - "{} function not implemented!", __FUNCTION__); - - return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; -} } // namespace v2 diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp index ef024dd65bd54..fb72c47a2ec0a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.hpp @@ -25,51 +25,35 @@ namespace v2 { -using queue_group_type = ur_device_handle_t_::queue_group_info_t::type; - -struct ur_queue_immediate_in_order_t : ur_object, public ur_queue_t_ { +struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ { private: ur_context_handle_t hContext; ur_device_handle_t hDevice; - ur_queue_flags_t flags; - lockable commandListManager; - std::vector submittedKernels; - - wait_list_view - getWaitListView(locked &commandList, - const ur_event_handle_t *phWaitEvents, uint32_t numWaitEvents, - ur_event_handle_t additionalWaitEvent = nullptr); - - ze_event_handle_t getSignalEvent(locked &commandList, - ur_event_handle_t *hUserEvent, - ur_command_t commandType); - - ur_result_t enqueueGenericFillUnlocked( - ur_mem_buffer_t *hBuffer, size_t offset, size_t patternSize, - const void *pPattern, size_t size, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, - ur_command_t commandType); - - ur_result_t enqueueGenericCommandListsExp( - uint32_t numCommandLists, ze_command_list_handle_t *phCommandLists, - ur_event_handle_t *phEvent, uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, ur_command_t callerCommand, - ur_event_handle_t additionalWaitEvent); + ur_queue_flags_t flags; + v2::raii::cache_borrowed_event_pool eventPool; - ur_result_t - enqueueEventsWaitWithBarrierImpl(uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent); + ur_event_handle_t *createEventIfRequested(ur_event_handle_t *phEvent, + ur_command_t commandType) { + if (phEvent == nullptr) { + return nullptr; + } - void recordSubmittedKernel(ur_kernel_handle_t hKernel); + (*phEvent) = eventPool->allocate(); + (*phEvent)->resetQueueAndCommand(this, commandType); + return phEvent; + } public: ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t, - const ur_queue_properties_t *); + uint32_t ordinal, + ze_command_queue_priority_t priority, + std::optional index, + event_flags_t eventFlags, + ur_queue_flags_t flags); ur_queue_immediate_in_order_t(ur_context_handle_t, ur_device_handle_t, - ur_native_handle_t, ur_queue_flags_t, - bool ownZeQueue); + raii::command_list_unique_handle, event_flags_t, + ur_queue_flags_t); ~ur_queue_immediate_in_order_t(); @@ -79,164 +63,325 @@ struct ur_queue_immediate_in_order_t : ur_object, public ur_queue_t_ { ur_native_handle_t *phNativeQueue) override; ur_result_t queueFinish() override; ur_result_t queueFlush() override; - ur_result_t enqueueKernelLaunch(ur_kernel_handle_t hKernel, uint32_t workDim, - const size_t *pGlobalWorkOffset, - const size_t *pGlobalWorkSize, - const size_t *pLocalWorkSize, - uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueEventsWait(uint32_t numEventsInWaitList, - const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; ur_result_t enqueueEventsWaitWithBarrier(uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) override; - ur_result_t enqueueEventsWaitWithBarrierExt( - const ur_exp_enqueue_ext_properties_t *pProperties, - uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + + ur_result_t enqueueEventsWait(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueEventsWait( + numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_EVENTS_WAIT)); + } + ur_result_t + enqueueEventsWaitWithBarrierExt(const ur_exp_enqueue_ext_properties_t *, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + return enqueueEventsWaitWithBarrier(numEventsInWaitList, phEventWaitList, + phEvent); + } + ur_result_t enqueueMemBufferRead(ur_mem_handle_t hBuffer, bool blockingRead, size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueMemBufferRead( + hBuffer, blockingRead, offset, size, pDst, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_MEM_BUFFER_READ)); + } + ur_result_t enqueueMemBufferWrite(ur_mem_handle_t hBuffer, bool blockingWrite, size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueMemBufferWrite( + hBuffer, blockingWrite, offset, size, pSrc, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_MEM_BUFFER_WRITE)); + } + ur_result_t enqueueMemBufferReadRect( ur_mem_handle_t hBuffer, bool blockingRead, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueMemBufferReadRect( + hBuffer, blockingRead, bufferOrigin, hostOrigin, region, bufferRowPitch, + bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_MEM_BUFFER_READ)); + } + ur_result_t enqueueMemBufferWriteRect( ur_mem_handle_t hBuffer, bool blockingWrite, ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueMemBufferWriteRect( + hBuffer, blockingWrite, bufferOrigin, hostOrigin, region, + bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_MEM_BUFFER_WRITE)); + } + ur_result_t enqueueMemBufferCopy(ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, size_t srcOffset, size_t dstOffset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueMemBufferCopy( + hBufferSrc, hBufferDst, srcOffset, dstOffset, size, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_MEM_BUFFER_COPY)); + } + ur_result_t enqueueMemBufferCopyRect( ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueMemBufferCopyRect( + hBufferSrc, hBufferDst, srcOrigin, dstOrigin, region, srcRowPitch, + srcSlicePitch, dstRowPitch, dstSlicePitch, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_MEM_BUFFER_COPY_RECT)); + } + ur_result_t enqueueMemBufferFill(ur_mem_handle_t hBuffer, const void *pPattern, size_t patternSize, size_t offset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueMemBufferFill( + hBuffer, pPattern, patternSize, offset, size, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_MEM_BUFFER_FILL)); + } + ur_result_t enqueueMemImageRead(ur_mem_handle_t hImage, bool blockingRead, ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueMemImageRead( + hImage, blockingRead, origin, region, rowPitch, slicePitch, pDst, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_MEM_IMAGE_READ)); + } + ur_result_t enqueueMemImageWrite(ur_mem_handle_t hImage, bool blockingWrite, ur_rect_offset_t origin, ur_rect_region_t region, size_t rowPitch, size_t slicePitch, void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueMemImageWrite( + hImage, blockingWrite, origin, region, rowPitch, slicePitch, pSrc, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_MEM_IMAGE_WRITE)); + } + ur_result_t enqueueMemImageCopy(ur_mem_handle_t hImageSrc, ur_mem_handle_t hImageDst, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueMemImageCopy( + hImageSrc, hImageDst, srcOrigin, dstOrigin, region, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_MEM_IMAGE_COPY)); + } + ur_result_t enqueueMemBufferMap(ur_mem_handle_t hBuffer, bool blockingMap, ur_map_flags_t mapFlags, size_t offset, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent, - void **ppRetMap) override; + void **ppRetMap) override { + return commandListManager.lock()->enqueueMemBufferMap( + hBuffer, blockingMap, mapFlags, offset, size, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_MEM_BUFFER_MAP), ppRetMap); + } + ur_result_t enqueueMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueMemUnmap( + hMem, pMappedPtr, numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_MEM_UNMAP)); + } + ur_result_t enqueueUSMFill(void *pMem, size_t patternSize, const void *pPattern, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueUSMFill( + pMem, patternSize, pPattern, size, numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_USM_FILL)); + } + ur_result_t enqueueUSMMemcpy(bool blocking, void *pDst, const void *pSrc, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t enqueueUSMFill2D(void *, size_t, size_t, const void *, size_t, - size_t, uint32_t, const ur_event_handle_t *, - ur_event_handle_t *) override; - ur_result_t enqueueUSMMemcpy2D(bool, void *, size_t, const void *, size_t, - size_t, size_t, uint32_t, - const ur_event_handle_t *, - ur_event_handle_t *) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueUSMMemcpy( + blocking, pDst, pSrc, size, numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_USM_MEMCPY)); + } + + ur_result_t enqueueUSMFill2D(void *pMem, size_t pitch, size_t patternSize, + const void *pPattern, size_t width, + size_t height, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueUSMFill2D( + pMem, pitch, patternSize, pPattern, width, height, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_USM_FILL_2D)); + } + + ur_result_t enqueueUSMMemcpy2D(bool blocking, void *pDst, size_t dstPitch, + const void *pSrc, size_t srcPitch, + size_t width, size_t height, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueUSMMemcpy2D( + blocking, pDst, dstPitch, pSrc, srcPitch, width, height, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_USM_MEMCPY_2D)); + } + ur_result_t enqueueUSMPrefetch(const void *pMem, size_t size, ur_usm_migration_flags_t flags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueUSMPrefetch( + pMem, size, flags, numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_USM_PREFETCH)); + } + ur_result_t enqueueUSMAdvise(const void *pMem, size_t size, ur_usm_advice_flags_t advice, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueUSMAdvise( + pMem, size, advice, + createEventIfRequested(phEvent, UR_COMMAND_USM_ADVISE)); + } + ur_result_t enqueueDeviceGlobalVariableWrite( ur_program_handle_t hProgram, const char *name, bool blockingWrite, size_t count, size_t offset, const void *pSrc, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueDeviceGlobalVariableWrite( + hProgram, name, blockingWrite, count, offset, pSrc, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(phEvent, + UR_COMMAND_DEVICE_GLOBAL_VARIABLE_WRITE)); + } + ur_result_t enqueueDeviceGlobalVariableRead( ur_program_handle_t hProgram, const char *name, bool blockingRead, size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueDeviceGlobalVariableRead( + hProgram, name, blockingRead, count, offset, pDst, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(phEvent, + UR_COMMAND_DEVICE_GLOBAL_VARIABLE_READ)); + } + ur_result_t enqueueReadHostPipe(ur_program_handle_t hProgram, const char *pipe_symbol, bool blocking, void *pDst, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueReadHostPipe( + hProgram, pipe_symbol, blocking, pDst, size, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_READ_HOST_PIPE)); + } + ur_result_t enqueueWriteHostPipe(ur_program_handle_t hProgram, const char *pipe_symbol, bool blocking, void *pSrc, size_t size, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueWriteHostPipe( + hProgram, pipe_symbol, blocking, pSrc, size, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_WRITE_HOST_PIPE)); + } + ur_result_t enqueueUSMDeviceAllocExp( ur_usm_pool_handle_t pPool, const size_t size, const ur_exp_async_usm_alloc_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - void **ppMem, ur_event_handle_t *phEvent) override; + void **ppMem, ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueUSMDeviceAllocExp( + pPool, size, pProperties, numEventsInWaitList, phEventWaitList, ppMem, + createEventIfRequested(phEvent, + UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP)); + } + ur_result_t enqueueUSMSharedAllocExp( ur_usm_pool_handle_t pPool, const size_t size, const ur_exp_async_usm_alloc_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - void **ppMem, ur_event_handle_t *phEvent) override; + void **ppMem, ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueUSMSharedAllocExp( + pPool, size, pProperties, numEventsInWaitList, phEventWaitList, ppMem, + createEventIfRequested(phEvent, + UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP)); + } + ur_result_t enqueueUSMHostAllocExp(ur_usm_pool_handle_t pPool, const size_t size, const ur_exp_async_usm_alloc_properties_t *pProperties, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, void **ppMem, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueUSMHostAllocExp( + pPool, size, pProperties, numEventsInWaitList, phEventWaitList, ppMem, + createEventIfRequested(phEvent, UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP)); + } + ur_result_t enqueueUSMFreeExp(ur_usm_pool_handle_t pPool, void *pMem, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueUSMFreeExp( + pPool, pMem, numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_ENQUEUE_USM_FREE_EXP)); + } + ur_result_t bindlessImagesImageCopyExp( const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, const ur_image_desc_t *pDstImageDesc, @@ -245,45 +390,107 @@ struct ur_queue_immediate_in_order_t : ur_object, public ur_queue_t_ { ur_exp_image_copy_region_t *pCopyRegion, ur_exp_image_copy_flags_t imageCopyFlags, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->bindlessImagesImageCopyExp( + pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, + pDstImageFormat, pCopyRegion, imageCopyFlags, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_MEM_IMAGE_COPY)); + } + ur_result_t bindlessImagesWaitExternalSemaphoreExp( ur_exp_external_semaphore_handle_t hSemaphore, bool hasWaitValue, uint64_t waitValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->bindlessImagesWaitExternalSemaphoreExp( + hSemaphore, hasWaitValue, waitValue, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(phEvent, + UR_COMMAND_EXTERNAL_SEMAPHORE_WAIT_EXP)); + } + ur_result_t bindlessImagesSignalExternalSemaphoreExp( ur_exp_external_semaphore_handle_t hSemaphore, bool hasSignalValue, uint64_t signalValue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->bindlessImagesSignalExternalSemaphoreExp( + hSemaphore, hasSignalValue, signalValue, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(phEvent, + UR_COMMAND_EXTERNAL_SEMAPHORE_SIGNAL_EXP)); + } + ur_result_t enqueueCooperativeKernelLaunchExp( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueCooperativeKernelLaunchExp( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_KERNEL_LAUNCH)); + } + ur_result_t enqueueTimestampRecordingExp(bool blocking, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueTimestampRecordingExp( + blocking, numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_TIMESTAMP_RECORDING_EXP)); + } + ur_result_t enqueueCommandBufferExp(ur_exp_command_buffer_handle_t hCommandBuffer, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueCommandBufferExp( + hCommandBuffer, numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP)); + } + + ur_result_t enqueueKernelLaunch(ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueKernelLaunch( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_KERNEL_LAUNCH)); + } + ur_result_t enqueueKernelLaunchCustomExp( ur_kernel_handle_t hKernel, uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, const ur_exp_launch_property_t *launchPropList, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, - ur_event_handle_t *phEvent) override; - ur_result_t - enqueueNativeCommandExp(ur_exp_enqueue_native_command_function_t, void *, - uint32_t, const ur_mem_handle_t *, - const ur_exp_enqueue_native_command_properties_t *, - uint32_t, const ur_event_handle_t *, - ur_event_handle_t *) override; + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueKernelLaunchCustomExp( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, numEventsInWaitList, + phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_KERNEL_LAUNCH)); + } + + ur_result_t enqueueNativeCommandExp( + ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data, + uint32_t numMemsInMemList, const ur_mem_handle_t *phMemList, + const ur_exp_enqueue_native_command_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + return commandListManager.lock()->enqueueNativeCommandExp( + pfnNativeEnqueue, data, numMemsInMemList, phMemList, pProperties, + numEventsInWaitList, phEventWaitList, + createEventIfRequested(phEvent, UR_COMMAND_ENQUEUE_NATIVE_EXP)); + } }; } // namespace v2 diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp new file mode 100644 index 0000000000000..78f1b9f46731c --- /dev/null +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.cpp @@ -0,0 +1,222 @@ +//===--------- queue_immediate_in_order.cpp - Level Zero Adapter ---------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "queue_immediate_out_of_order.hpp" +#include "command_buffer.hpp" +#include "kernel.hpp" +#include "memory.hpp" +#include "ur.hpp" + +#include "../common/latency_tracker.hpp" +#include "../helpers/kernel_helpers.hpp" +#include "../image_common.hpp" + +#include "../program.hpp" +#include "../ur_interface_loader.hpp" + +namespace v2 { + +template +std::array, sizeof...(Is)> +createCommandListManagers(ur_context_handle_t hContext, + ur_device_handle_t hDevice, uint32_t ordinal, + ze_command_queue_priority_t priority, + std::index_sequence) { + return { + ((void)Is, lockable( + hContext, hDevice, + hContext->getCommandListCache().getImmediateCommandList( + hDevice->ZeDevice, + {true, ordinal, true /* always enable copy offload */}, + ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, priority)))...}; +} + +template +std::array, N> +createCommandListManagers(ur_context_handle_t hContext, + ur_device_handle_t hDevice, uint32_t ordinal, + ze_command_queue_priority_t priority) { + return createCommandListManagers(hContext, hDevice, ordinal, priority, + std::make_index_sequence{}); +} + +ur_queue_immediate_out_of_order_t::ur_queue_immediate_out_of_order_t( + ur_context_handle_t hContext, ur_device_handle_t hDevice, uint32_t ordinal, + ze_command_queue_priority_t priority, event_flags_t eventFlags, + ur_queue_flags_t flags) + : hContext(hContext), hDevice(hDevice), + commandListManagers(createCommandListManagers( + hContext, hDevice, ordinal, priority)), + eventPool(hContext->getEventPoolCache().borrow(hDevice->Id.value(), + eventFlags)), + flags(flags) { + // TODO: dummy operation to ensure that counter-based events are signaled + // rewrite this using zeCreateCounterBasedEventExt + void *tmpMem = nullptr; + uint32_t tmpPattern = 0; + UR_CALL_THROWS(ur::level_zero::urUSMHostAlloc(hContext, nullptr, nullptr, + sizeof(tmpPattern), &tmpMem)); + + for (size_t i = 0; i < numCommandLists; ++i) { + internalSignalEvents[i] = eventPool->allocate(); + commandListManagers[i].get_no_lock()->enqueueUSMFill( + tmpMem, sizeof(tmpPattern), &tmpPattern, sizeof(tmpPattern), 0, nullptr, + &internalSignalEvents[i]); + ZE2UR_CALL_THROWS( + zeCommandListHostSynchronize, + (commandListManagers[i].get_no_lock()->getZeCommandList(), UINT64_MAX)); + + signalEvents.assign(i, internalSignalEvents[i], false); + } + + UR_CALL_THROWS(ur::level_zero::urUSMFree(hContext, tmpMem)); +} + +ur_result_t ur_queue_immediate_out_of_order_t::queueGetInfo( + ur_queue_info_t propName, size_t propSize, void *pPropValue, + size_t *pPropSizeRet) { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + // TODO: consider support for queue properties and size + switch ((uint32_t)propName) { // cast to avoid warnings on EXT enum values + case UR_QUEUE_INFO_CONTEXT: + return ReturnValue(hContext); + case UR_QUEUE_INFO_DEVICE: + return ReturnValue(hDevice); + case UR_QUEUE_INFO_REFERENCE_COUNT: + return ReturnValue(uint32_t{RefCount.load()}); + case UR_QUEUE_INFO_FLAGS: + return ReturnValue(flags); + case UR_QUEUE_INFO_SIZE: + case UR_QUEUE_INFO_DEVICE_DEFAULT: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + case UR_QUEUE_INFO_EMPTY: { + auto isCmdListEmpty = [](ze_command_list_handle_t cmdList) { + auto status = ZE_CALL_NOCHECK(zeCommandListHostSynchronize, (cmdList, 0)); + if (status == ZE_RESULT_SUCCESS) { + return true; + } else if (status == ZE_RESULT_NOT_READY) { + return false; + } else { + throw ze2urResult(status); + } + }; + + bool empty = std::all_of( + commandListManagers.begin(), commandListManagers.end(), + [&](auto &cmdListManager) { + return isCmdListEmpty(cmdListManager.lock()->getZeCommandList()); + }); + + return ReturnValue(empty); + } + default: + UR_LOG(ERR, + "Unsupported ParamName in urQueueGetInfo: " + "ParamName=ParamName={}(0x{})", + propName, logger::toHex(propName)); + return UR_RESULT_ERROR_INVALID_VALUE; + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_immediate_out_of_order_t::queueGetNativeHandle( + ur_queue_native_desc_t * /*pDesc*/, ur_native_handle_t *phNativeQueue) { + *phNativeQueue = reinterpret_cast( + commandListManagers[getNextCommandListId()] + .get_no_lock() + ->getZeCommandList()); + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_immediate_out_of_order_t::queueFinish() { + TRACK_SCOPE_LATENCY("ur_queue_immediate_out_of_order_t::queueFinish"); + + auto lastCommandListId = + commandListIndex.load(std::memory_order_relaxed) % numCommandLists; + + // UR_CALL(commandListManagers[lastCommandListId].lock()->enqueueEventsWait( + // numCommandLists, signalEvents.events.data(), nullptr)); + for (int i = 0; i < numCommandLists; ++i) { + ZE2UR_CALL(zeCommandListHostSynchronize, + (commandListManagers[i].lock()->getZeCommandList(), + UINT64_MAX)); + } + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_immediate_out_of_order_t::queueFlush() { + return UR_RESULT_SUCCESS; +} + +ur_queue_immediate_out_of_order_t::~ur_queue_immediate_out_of_order_t() { + try { + UR_CALL_THROWS(queueFinish()); + } catch (...) { + // Ignore errors during destruction + } +} + +ur_result_t ur_queue_immediate_out_of_order_t::enqueueEventsWait( + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + auto lastCommandListId = + commandListIndex.load(std::memory_order_relaxed) % numCommandLists; + + UR_CALL(commandListManagers[lastCommandListId].lock()->enqueueEventsWait( + numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent(lastCommandListId, phEvent, + UR_COMMAND_EVENTS_WAIT))); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrierImpl( + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + auto lastCommandListId = + commandListIndex.load(std::memory_order_relaxed) % numCommandLists; + + UR_CALL(commandListManagers[lastCommandListId] + .lock() + ->enqueueEventsWaitWithBarrier( + numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent(lastCommandListId, phEvent, + UR_COMMAND_EVENTS_WAIT))); + + return UR_RESULT_SUCCESS; +} + +ur_result_t ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrier( + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { + // For in-order command lists we don't need a real barrier, just wait for + // requested events in potentially different queues and add a "barrier" + // event signal because it is already guaranteed that previous commands + // in this queue are completed when the signal is started. However, we do + // need to use barrier if profiling is enabled: see + // zeCommandListAppendWaitOnEvents + if ((flags & UR_QUEUE_FLAG_PROFILING_ENABLE) != 0) { + return enqueueEventsWaitWithBarrierImpl(numEventsInWaitList, + phEventWaitList, phEvent); + } else { + return enqueueEventsWait(numEventsInWaitList, phEventWaitList, phEvent); + } +} + +ur_result_t ur_queue_immediate_out_of_order_t::enqueueEventsWaitWithBarrierExt( + const ur_exp_enqueue_ext_properties_t *, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + return enqueueEventsWaitWithBarrier(numEventsInWaitList, phEventWaitList, + phEvent); +} + +} // namespace v2 diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp new file mode 100644 index 0000000000000..4f4d0cd8bccc9 --- /dev/null +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_out_of_order.hpp @@ -0,0 +1,612 @@ +//===--------- queue_immediate_in_order.hpp - Level Zero Adapter ---------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +#pragma once + +#include "../common.hpp" +#include "../device.hpp" + +#include "context.hpp" +#include "event.hpp" +#include "event_pool_cache.hpp" +#include "memory.hpp" +#include "queue_api.hpp" + +#include "ur/ur.hpp" + +#include "command_list_manager.hpp" +#include "lockable.hpp" + +namespace v2 { + +template struct signal_events_handle { + std::array events; + std::array isExternal; + + signal_events_handle() { + for (size_t i = 0; i < N; ++i) { + events[i] = nullptr; + isExternal[i] = false; + } + } + + ~signal_events_handle() { + for (size_t i = 0; i < N; ++i) { + if (isExternal[i] && events[i] != nullptr) { + events[i]->release(); + } + } + } + + void assign(size_t index, ur_event_handle_t event, bool external) { + if (isExternal[index] && events[index] != nullptr) { + events[index]->release(); + } + + events[index] = event; + isExternal[index] = external; + + if (isExternal[index]) { + events[index]->retain(); + } + } +}; + +struct ur_queue_immediate_out_of_order_t : ur_object, ur_queue_t_ { +private: + static constexpr size_t numCommandLists = 4; + + ur_context_handle_t hContext; + ur_device_handle_t hDevice; + + signal_events_handle signalEvents; + std::atomic commandListIndex = 0; + std::array internalSignalEvents; + + std::array, numCommandLists> + commandListManagers; + + v2::raii::cache_borrowed_event_pool eventPool; + + ur_queue_flags_t flags; + + uint32_t getNextCommandListId() { + return commandListIndex.fetch_add(1, std::memory_order_relaxed) % + numCommandLists; + } + + ur_event_handle_t *createOrForwardSignalEvent(uint32_t cmdListId, + ur_event_handle_t *phEvent, + ur_command_t commandType) { + if (phEvent == nullptr) { + //phEvent = &internalSignalEvents[cmdListId]; + //signalEvents.assign(cmdListId, *phEvent, false); + return nullptr; + } else { + (*phEvent) = eventPool->allocate(); + //signalEvents.assign(cmdListId, *phEvent, true); + } + + (*phEvent)->resetQueueAndCommand(this, commandType); + return phEvent; + } + + ur_result_t + enqueueEventsWaitWithBarrierImpl(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent); + +public: + ur_queue_immediate_out_of_order_t(ur_context_handle_t, ur_device_handle_t, + uint32_t ordinal, + ze_command_queue_priority_t priority, + event_flags_t eventFlags, + ur_queue_flags_t flags); + + ~ur_queue_immediate_out_of_order_t(); + + ur_result_t queueGetInfo(ur_queue_info_t propName, size_t propSize, + void *pPropValue, size_t *pPropSizeRet) override; + ur_result_t queueGetNativeHandle(ur_queue_native_desc_t *pDesc, + ur_native_handle_t *phNativeQueue) override; + ur_result_t queueFinish() override; + ur_result_t queueFlush() override; + + ur_result_t enqueueEventsWait(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + ur_result_t + enqueueEventsWaitWithBarrier(uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + ur_result_t enqueueEventsWaitWithBarrierExt( + const ur_exp_enqueue_ext_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override; + + ur_result_t enqueueMemBufferRead(ur_mem_handle_t hBuffer, bool blockingRead, + size_t offset, size_t size, void *pDst, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueMemBufferRead( + hBuffer, blockingRead, offset, size, pDst, numEventsInWaitList, + phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_MEM_BUFFER_READ)); + } + + ur_result_t enqueueMemBufferWrite(ur_mem_handle_t hBuffer, bool blockingWrite, + size_t offset, size_t size, + const void *pSrc, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueMemBufferWrite( + hBuffer, blockingWrite, offset, size, pSrc, numEventsInWaitList, + phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_MEM_BUFFER_WRITE)); + } + + ur_result_t enqueueMemBufferReadRect( + ur_mem_handle_t hBuffer, bool blockingRead, ur_rect_offset_t bufferOrigin, + ur_rect_offset_t hostOrigin, ur_rect_region_t region, + size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, + size_t hostSlicePitch, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueMemBufferReadRect( + hBuffer, blockingRead, bufferOrigin, hostOrigin, region, bufferRowPitch, + bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, + numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_MEM_BUFFER_READ)); + } + + ur_result_t enqueueMemBufferWriteRect( + ur_mem_handle_t hBuffer, bool blockingWrite, + ur_rect_offset_t bufferOrigin, ur_rect_offset_t hostOrigin, + ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, + size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueMemBufferWriteRect( + hBuffer, blockingWrite, bufferOrigin, hostOrigin, region, + bufferRowPitch, bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, + numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_MEM_BUFFER_WRITE)); + } + + ur_result_t enqueueMemBufferCopy(ur_mem_handle_t hBufferSrc, + ur_mem_handle_t hBufferDst, size_t srcOffset, + size_t dstOffset, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueMemBufferCopy( + hBufferSrc, hBufferDst, srcOffset, dstOffset, size, numEventsInWaitList, + phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_MEM_BUFFER_COPY)); + } + + ur_result_t enqueueMemBufferCopyRect( + ur_mem_handle_t hBufferSrc, ur_mem_handle_t hBufferDst, + ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, + ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, + size_t dstRowPitch, size_t dstSlicePitch, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueMemBufferCopyRect( + hBufferSrc, hBufferDst, srcOrigin, dstOrigin, region, srcRowPitch, + srcSlicePitch, dstRowPitch, dstSlicePitch, numEventsInWaitList, + phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_MEM_BUFFER_COPY_RECT)); + } + + ur_result_t enqueueMemBufferFill(ur_mem_handle_t hBuffer, + const void *pPattern, size_t patternSize, + size_t offset, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueMemBufferFill( + hBuffer, pPattern, patternSize, offset, size, numEventsInWaitList, + phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_MEM_BUFFER_FILL)); + } + + ur_result_t enqueueMemImageRead(ur_mem_handle_t hImage, bool blockingRead, + ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pDst, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueMemImageRead( + hImage, blockingRead, origin, region, rowPitch, slicePitch, pDst, + numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_MEM_IMAGE_READ)); + } + + ur_result_t enqueueMemImageWrite(ur_mem_handle_t hImage, bool blockingWrite, + ur_rect_offset_t origin, + ur_rect_region_t region, size_t rowPitch, + size_t slicePitch, void *pSrc, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueMemImageWrite( + hImage, blockingWrite, origin, region, rowPitch, slicePitch, pSrc, + numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_MEM_IMAGE_WRITE)); + } + + ur_result_t + enqueueMemImageCopy(ur_mem_handle_t hImageSrc, ur_mem_handle_t hImageDst, + ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, + ur_rect_region_t region, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueMemImageCopy( + hImageSrc, hImageDst, srcOrigin, dstOrigin, region, numEventsInWaitList, + phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_MEM_IMAGE_COPY)); + } + + ur_result_t enqueueMemBufferMap(ur_mem_handle_t hBuffer, bool blockingMap, + ur_map_flags_t mapFlags, size_t offset, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent, + void **ppRetMap) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueMemBufferMap( + hBuffer, blockingMap, mapFlags, offset, size, numEventsInWaitList, + phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_MEM_BUFFER_MAP), + ppRetMap); + } + + ur_result_t enqueueMemUnmap(ur_mem_handle_t hMem, void *pMappedPtr, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueMemUnmap( + hMem, pMappedPtr, numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, UR_COMMAND_MEM_UNMAP)); + } + + ur_result_t enqueueUSMFill(void *pMem, size_t patternSize, + const void *pPattern, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueUSMFill( + pMem, patternSize, pPattern, size, numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, UR_COMMAND_USM_FILL)); + } + + ur_result_t enqueueUSMMemcpy(bool blocking, void *pDst, const void *pSrc, + size_t size, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueUSMMemcpy( + blocking, pDst, pSrc, size, numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, UR_COMMAND_USM_MEMCPY)); + } + + ur_result_t enqueueUSMFill2D(void *pMem, size_t pitch, size_t patternSize, + const void *pPattern, size_t width, + size_t height, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueUSMFill2D( + pMem, pitch, patternSize, pPattern, width, height, numEventsInWaitList, + phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, UR_COMMAND_USM_FILL_2D)); + } + + ur_result_t enqueueUSMMemcpy2D(bool blocking, void *pDst, size_t dstPitch, + const void *pSrc, size_t srcPitch, + size_t width, size_t height, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueUSMMemcpy2D( + blocking, pDst, dstPitch, pSrc, srcPitch, width, height, + numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_USM_MEMCPY_2D)); + } + + ur_result_t enqueueUSMPrefetch(const void *pMem, size_t size, + ur_usm_migration_flags_t flags, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueUSMPrefetch( + pMem, size, flags, numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_USM_PREFETCH)); + } + + ur_result_t enqueueUSMAdvise(const void *pMem, size_t size, + ur_usm_advice_flags_t advice, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueUSMAdvise( + pMem, size, advice, + createOrForwardSignalEvent(cmdListId, phEvent, UR_COMMAND_USM_ADVISE)); + } + + ur_result_t enqueueDeviceGlobalVariableWrite( + ur_program_handle_t hProgram, const char *name, bool blockingWrite, + size_t count, size_t offset, const void *pSrc, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId] + .lock() + ->enqueueDeviceGlobalVariableWrite( + hProgram, name, blockingWrite, count, offset, pSrc, + numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent( + cmdListId, phEvent, UR_COMMAND_DEVICE_GLOBAL_VARIABLE_WRITE)); + } + + ur_result_t enqueueDeviceGlobalVariableRead( + ur_program_handle_t hProgram, const char *name, bool blockingRead, + size_t count, size_t offset, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId] + .lock() + ->enqueueDeviceGlobalVariableRead( + hProgram, name, blockingRead, count, offset, pDst, + numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_DEVICE_GLOBAL_VARIABLE_READ)); + } + + ur_result_t enqueueReadHostPipe(ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, + void *pDst, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueReadHostPipe( + hProgram, pipe_symbol, blocking, pDst, size, numEventsInWaitList, + phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_READ_HOST_PIPE)); + } + + ur_result_t enqueueWriteHostPipe(ur_program_handle_t hProgram, + const char *pipe_symbol, bool blocking, + void *pSrc, size_t size, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueWriteHostPipe( + hProgram, pipe_symbol, blocking, pSrc, size, numEventsInWaitList, + phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_WRITE_HOST_PIPE)); + } + + ur_result_t enqueueUSMDeviceAllocExp( + ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueUSMDeviceAllocExp( + pPool, size, pProperties, numEventsInWaitList, phEventWaitList, ppMem, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP)); + } + + ur_result_t enqueueUSMSharedAllocExp( + ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + void **ppMem, ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueUSMSharedAllocExp( + pPool, size, pProperties, numEventsInWaitList, phEventWaitList, ppMem, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP)); + } + + ur_result_t + enqueueUSMHostAllocExp(ur_usm_pool_handle_t pPool, const size_t size, + const ur_exp_async_usm_alloc_properties_t *pProperties, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, void **ppMem, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueUSMHostAllocExp( + pPool, size, pProperties, numEventsInWaitList, phEventWaitList, ppMem, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP)); + } + + ur_result_t enqueueUSMFreeExp(ur_usm_pool_handle_t pPool, void *pMem, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueUSMFreeExp( + pPool, pMem, numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_ENQUEUE_USM_FREE_EXP)); + } + + ur_result_t bindlessImagesImageCopyExp( + const void *pSrc, void *pDst, const ur_image_desc_t *pSrcImageDesc, + const ur_image_desc_t *pDstImageDesc, + const ur_image_format_t *pSrcImageFormat, + const ur_image_format_t *pDstImageFormat, + ur_exp_image_copy_region_t *pCopyRegion, + ur_exp_image_copy_flags_t imageCopyFlags, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->bindlessImagesImageCopyExp( + pSrc, pDst, pSrcImageDesc, pDstImageDesc, pSrcImageFormat, + pDstImageFormat, pCopyRegion, imageCopyFlags, numEventsInWaitList, + phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_MEM_IMAGE_COPY)); + } + + ur_result_t bindlessImagesWaitExternalSemaphoreExp( + ur_exp_external_semaphore_handle_t hSemaphore, bool hasWaitValue, + uint64_t waitValue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId] + .lock() + ->bindlessImagesWaitExternalSemaphoreExp( + hSemaphore, hasWaitValue, waitValue, numEventsInWaitList, + phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_EXTERNAL_SEMAPHORE_WAIT_EXP)); + } + + ur_result_t bindlessImagesSignalExternalSemaphoreExp( + ur_exp_external_semaphore_handle_t hSemaphore, bool hasSignalValue, + uint64_t signalValue, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId] + .lock() + ->bindlessImagesSignalExternalSemaphoreExp( + hSemaphore, hasSignalValue, signalValue, numEventsInWaitList, + phEventWaitList, + createOrForwardSignalEvent( + cmdListId, phEvent, UR_COMMAND_EXTERNAL_SEMAPHORE_SIGNAL_EXP)); + } + + ur_result_t enqueueCooperativeKernelLaunchExp( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId] + .lock() + ->enqueueCooperativeKernelLaunchExp( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, + pLocalWorkSize, numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_KERNEL_LAUNCH)); + } + + ur_result_t + enqueueTimestampRecordingExp(bool blocking, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueTimestampRecordingExp( + blocking, numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_TIMESTAMP_RECORDING_EXP)); + } + + ur_result_t + enqueueCommandBufferExp(ur_exp_command_buffer_handle_t hCommandBuffer, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueCommandBufferExp( + hCommandBuffer, numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP)); + } + + ur_result_t enqueueKernelLaunch(ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, + const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueKernelLaunch( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_KERNEL_LAUNCH)); + } + + ur_result_t enqueueKernelLaunchCustomExp( + ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numPropsInLaunchPropList, + const ur_exp_launch_property_t *launchPropList, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueKernelLaunchCustomExp( + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numPropsInLaunchPropList, launchPropList, numEventsInWaitList, + phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_KERNEL_LAUNCH)); + } + + ur_result_t enqueueNativeCommandExp( + ur_exp_enqueue_native_command_function_t pfnNativeEnqueue, void *data, + uint32_t numMemsInMemList, const ur_mem_handle_t *phMemList, + const ur_exp_enqueue_native_command_properties_t *pProperties, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) override { + auto cmdListId = getNextCommandListId(); + return commandListManagers[cmdListId].lock()->enqueueNativeCommandExp( + pfnNativeEnqueue, data, numMemsInMemList, phMemList, pProperties, + numEventsInWaitList, phEventWaitList, + createOrForwardSignalEvent(cmdListId, phEvent, + UR_COMMAND_ENQUEUE_NATIVE_EXP)); + } +}; + +} // namespace v2 diff --git a/unified-runtime/test/adapters/level_zero/v2/CMakeLists.txt b/unified-runtime/test/adapters/level_zero/v2/CMakeLists.txt index f3ef47a2f947c..ea73facd464f4 100644 --- a/unified-runtime/test/adapters/level_zero/v2/CMakeLists.txt +++ b/unified-runtime/test/adapters/level_zero/v2/CMakeLists.txt @@ -49,8 +49,8 @@ else() ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/event_provider_normal.cpp ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/event_provider_counter.cpp ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/event.cpp - ${PROJECT_SOURCE_DIR}/source/adapters/level_zero/v2/queue_api.cpp ) + target_include_directories(test-adapter-level_zero_event_pool PUBLIC ${PROJECT_SOURCE_DIR}/source/ur) endif() add_adapter_test(level_zero_memory_residency diff --git a/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp b/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp index b1a6dd8aad0a2..c8aeb6942586a 100644 --- a/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp +++ b/unified-runtime/test/adapters/level_zero/v2/event_pool_test.cpp @@ -175,7 +175,7 @@ TEST_P(EventPoolTest, Basic) { auto pool = cache->borrow(device->Id.value(), getParam().flags); first = pool->allocate(); - first->resetQueueAndCommand(&queue->get(), UR_COMMAND_KERNEL_LAUNCH); + first->resetQueueAndCommand(nullptr, UR_COMMAND_KERNEL_LAUNCH); zeFirst = first->getZeEvent(); urEventRelease(first); @@ -186,7 +186,7 @@ TEST_P(EventPoolTest, Basic) { auto pool = cache->borrow(device->Id.value(), getParam().flags); second = pool->allocate(); - first->resetQueueAndCommand(&queue->get(), UR_COMMAND_KERNEL_LAUNCH); + first->resetQueueAndCommand(nullptr, UR_COMMAND_KERNEL_LAUNCH); zeSecond = second->getZeEvent(); urEventRelease(second); @@ -206,7 +206,7 @@ TEST_P(EventPoolTest, Threaded) { std::vector events; for (int i = 0; i < 100; ++i) { events.push_back(pool->allocate()); - events.back()->resetQueueAndCommand(&queue->get(), + events.back()->resetQueueAndCommand(nullptr, UR_COMMAND_KERNEL_LAUNCH); } for (int i = 0; i < 100; ++i) { @@ -226,7 +226,7 @@ TEST_P(EventPoolTest, ProviderNormalUseMostFreePool) { std::list events; for (int i = 0; i < 128; ++i) { auto event = pool->allocate(); - event->resetQueueAndCommand(&queue->get(), UR_COMMAND_KERNEL_LAUNCH); + event->resetQueueAndCommand(nullptr, UR_COMMAND_KERNEL_LAUNCH); events.push_back(event); } auto frontZeHandle = events.front()->getZeEvent(); @@ -236,7 +236,7 @@ TEST_P(EventPoolTest, ProviderNormalUseMostFreePool) { } for (int i = 0; i < 8; ++i) { auto e = pool->allocate(); - e->resetQueueAndCommand(&queue->get(), UR_COMMAND_KERNEL_LAUNCH); + e->resetQueueAndCommand(nullptr, UR_COMMAND_KERNEL_LAUNCH); events.push_back(e); }