diff --git a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp index a6541cff99adf..855e6ca0ca963 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_buffer.cpp @@ -66,12 +66,42 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_( ur_context_handle_t context, ur_device_handle_t device, v2::raii::command_list_unique_handle &&commandList, const ur_exp_command_buffer_desc_t *desc) - : commandListManager( + : isUpdatable(desc ? desc->isUpdatable : false), + isInOrder(desc ? desc->isInOrder : false), + commandListManager( context, device, std::forward(commandList), - v2::EVENT_FLAGS_COUNTER, nullptr), - isUpdatable(desc ? desc->isUpdatable : false), context(context), - device(device) {} + isInOrder ? v2::EVENT_FLAGS_COUNTER : 0, nullptr, + PoolCacheType::Regular), + context(context), device(device) {} + +ur_exp_command_buffer_sync_point_t +ur_exp_command_buffer_handle_t_::getSyncPoint(ur_event_handle_t event) { + if (syncPoints.size() >= + std::numeric_limits::max()) { + UR_LOG(ERR, "Too many sync points"); + throw UR_RESULT_ERROR_OUT_OF_RESOURCES; + } + syncPoints.push_back(event); + return static_cast(syncPoints.size() - 1); +} + +ur_event_handle_t *ur_exp_command_buffer_handle_t_::getWaitListFromSyncPoints( + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numSyncPointsInWaitList) { + if (numSyncPointsInWaitList == 0) { + return nullptr; + } + syncPointWaitList.resize(numSyncPointsInWaitList); + for (uint32_t i = 0; i < numSyncPointsInWaitList; ++i) { + if (pSyncPointWaitList[i] >= syncPoints.size()) { + UR_LOG(ERR, "Invalid sync point"); + throw UR_RESULT_ERROR_INVALID_VALUE; + } + syncPointWaitList[i] = syncPoints[pSyncPointWaitList[i]]; + } + return syncPointWaitList.data(); +} ur_result_t ur_exp_command_buffer_handle_t_::createCommandHandle( locked &commandListLocked, @@ -97,6 +127,17 @@ ur_result_t ur_exp_command_buffer_handle_t_::finalizeCommandBuffer() { // It is not allowed to append to command list from multiple threads. auto commandListLocked = commandListManager.lock(); UR_ASSERT(!isFinalized, UR_RESULT_ERROR_INVALID_OPERATION); + + if (!isInOrder) { + ZE2UR_CALL(zeCommandListAppendBarrier, + (commandListLocked->getZeCommandList(), nullptr, 0, nullptr)); + for (auto &event : syncPoints) { + ZE2UR_CALL(zeCommandListAppendEventReset, + (commandListLocked->getZeCommandList(), event->getZeEvent())); + } + ZE2UR_CALL(zeCommandListAppendBarrier, + (commandListLocked->getZeCommandList(), nullptr, 0, nullptr)); + } // Close the command lists and have them ready for dispatch. ZE2UR_CALL(zeCommandListClose, (commandListLocked->getZeCommandList())); isFinalized = true; @@ -123,6 +164,9 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() { if (currentExecution) { currentExecution->release(); } + for (auto &event : syncPoints) { + event->release(); + } } ur_result_t ur_exp_command_buffer_handle_t_::applyUpdateCommands( @@ -175,7 +219,7 @@ urCommandBufferCreateExp(ur_context_handle_t context, ur_device_handle_t device, uint32_t queueGroupOrdinal = device->QueueGroup[queue_group_type::Compute].ZeOrdinal; v2::command_list_desc_t listDesc; - listDesc.IsInOrder = true; + listDesc.IsInOrder = commandBufferDesc->isInOrder; listDesc.Ordinal = queueGroupOrdinal; listDesc.CopyOffloadEnable = true; listDesc.Mutable = commandBufferDesc->isUpdatable; @@ -224,11 +268,11 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( uint32_t workDim, const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, const size_t *pLocalWorkSize, uint32_t numKernelAlternatives, ur_kernel_handle_t *kernelAlternatives, - uint32_t /*numSyncPointsInWaitList*/, - const ur_exp_command_buffer_sync_point_t * /*syncPointWaitList*/, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *syncPointWaitList, uint32_t /*numEventsInWaitList*/, const ur_event_handle_t * /*eventWaitList*/, - ur_exp_command_buffer_sync_point_t * /*retSyncPoint*/, + ur_exp_command_buffer_sync_point_t *retSyncPoint, ur_event_handle_t * /*event*/, ur_exp_command_buffer_command_handle_t *command) try { @@ -246,9 +290,21 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( commandListLocked, hKernel, workDim, pGlobalWorkSize, numKernelAlternatives, kernelAlternatives, command)); } + auto eventsWaitList = commandBuffer->getWaitListFromSyncPoints( + syncPointWaitList, numSyncPointsInWaitList); + ur_event_handle_t *event = nullptr; + ur_event_handle_t signalEvent = nullptr; + if (retSyncPoint != nullptr) { + event = &signalEvent; + } UR_CALL(commandListLocked->appendKernelLaunch( - hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, 0, - nullptr, nullptr)); + hKernel, workDim, pGlobalWorkOffset, pGlobalWorkSize, pLocalWorkSize, + numSyncPointsInWaitList, eventsWaitList, event)); + + if (retSyncPoint != nullptr) { + *retSyncPoint = commandBuffer->getSyncPoint(signalEvent); + } + return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -256,19 +312,29 @@ ur_result_t urCommandBufferAppendKernelLaunchExp( ur_result_t urCommandBufferAppendUSMMemcpyExp( ur_exp_command_buffer_handle_t hCommandBuffer, void *pDst, const void *pSrc, - size_t size, uint32_t /*numSyncPointsInWaitList*/, - const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, + size_t size, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t /*numEventsInWaitList*/, const ur_event_handle_t * /*phEventWaitList*/, - ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t * /*phEvent*/, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { // Responsibility of UMD to offload to copy engine auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMMemcpy(false, pDst, pSrc, size, 0, - nullptr, nullptr)); + auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( + pSyncPointWaitList, numSyncPointsInWaitList); + ur_event_handle_t *event = nullptr; + ur_event_handle_t signalEvent = nullptr; + if (pSyncPoint != nullptr) { + event = &signalEvent; + } + UR_CALL(commandListLocked->appendUSMMemcpy( + false, pDst, pSrc, size, numSyncPointsInWaitList, eventsWaitList, event)); + if (pSyncPoint != nullptr) { + *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -277,11 +343,11 @@ ur_result_t urCommandBufferAppendUSMMemcpyExp( ur_result_t urCommandBufferAppendMemBufferCopyExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hSrcMem, ur_mem_handle_t hDstMem, size_t srcOffset, size_t dstOffset, size_t size, - uint32_t /*numSyncPointsInWaitList*/, - const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t /*numEventsInWaitList*/, const ur_event_handle_t * /*phEventWaitList*/, - ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t * /*phEvent*/, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { @@ -289,9 +355,20 @@ ur_result_t urCommandBufferAppendMemBufferCopyExp( // sync mechanic can be ignored, because all lists are in-order // Responsibility of UMD to offload to copy engine auto commandListLocked = hCommandBuffer->commandListManager.lock(); + auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( + pSyncPointWaitList, numSyncPointsInWaitList); + ur_event_handle_t *event = nullptr; + ur_event_handle_t signalEvent = nullptr; + if (pSyncPoint != nullptr) { + event = &signalEvent; + } UR_CALL(commandListLocked->appendMemBufferCopy( - hSrcMem, hDstMem, srcOffset, dstOffset, size, 0, nullptr, nullptr)); + hSrcMem, hDstMem, srcOffset, dstOffset, size, numSyncPointsInWaitList, + eventsWaitList, event)); + if (pSyncPoint != nullptr) { + *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -300,11 +377,11 @@ ur_result_t urCommandBufferAppendMemBufferCopyExp( ur_result_t urCommandBufferAppendMemBufferWriteExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, size_t offset, size_t size, const void *pSrc, - uint32_t /*numSyncPointsInWaitList*/, - const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t /*numEventsInWaitList*/, const ur_event_handle_t * /*phEventWaitList*/, - ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t * /*phEvent*/, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { @@ -312,9 +389,20 @@ ur_result_t urCommandBufferAppendMemBufferWriteExp( // sync mechanic can be ignored, because all lists are in-order // Responsibility of UMD to offload to copy engine auto commandListLocked = hCommandBuffer->commandListManager.lock(); + auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( + pSyncPointWaitList, numSyncPointsInWaitList); + ur_event_handle_t *event = nullptr; + ur_event_handle_t signalEvent = nullptr; + if (pSyncPoint != nullptr) { + event = &signalEvent; + } UR_CALL(commandListLocked->appendMemBufferWrite(hBuffer, false, offset, size, - pSrc, 0, nullptr, nullptr)); + pSrc, numSyncPointsInWaitList, + eventsWaitList, event)); + if (pSyncPoint != nullptr) { + *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -322,21 +410,31 @@ ur_result_t urCommandBufferAppendMemBufferWriteExp( ur_result_t urCommandBufferAppendMemBufferReadExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, - size_t offset, size_t size, void *pDst, - uint32_t /*numSyncPointsInWaitList*/, - const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, + size_t offset, size_t size, void *pDst, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t /*numEventsInWaitList*/, const ur_event_handle_t * /*phEventWaitList*/, - ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t * /*phEvent*/, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { // the same issue as in urCommandBufferAppendKernelLaunchExp // Responsibility of UMD to offload to copy engine auto commandListLocked = hCommandBuffer->commandListManager.lock(); + auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( + pSyncPointWaitList, numSyncPointsInWaitList); + ur_event_handle_t *event = nullptr; + ur_event_handle_t signalEvent = nullptr; + if (pSyncPoint != nullptr) { + event = &signalEvent; + } UR_CALL(commandListLocked->appendMemBufferRead(hBuffer, false, offset, size, - pDst, 0, nullptr, nullptr)); + pDst, numSyncPointsInWaitList, + eventsWaitList, event)); + if (pSyncPoint != nullptr) { + *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -347,11 +445,11 @@ ur_result_t urCommandBufferAppendMemBufferCopyRectExp( ur_mem_handle_t hDstMem, ur_rect_offset_t srcOrigin, ur_rect_offset_t dstOrigin, ur_rect_region_t region, size_t srcRowPitch, size_t srcSlicePitch, size_t dstRowPitch, size_t dstSlicePitch, - uint32_t /*numSyncPointsInWaitList*/, - const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t /*numEventsInWaitList*/, const ur_event_handle_t * /*phEventWaitList*/, - ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t * /*phEvent*/, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { @@ -359,10 +457,21 @@ ur_result_t urCommandBufferAppendMemBufferCopyRectExp( // sync mechanic can be ignored, because all lists are in-order // Responsibility of UMD to offload to copy engine auto commandListLocked = hCommandBuffer->commandListManager.lock(); + auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( + pSyncPointWaitList, numSyncPointsInWaitList); + ur_event_handle_t *event = nullptr; + ur_event_handle_t signalEvent = nullptr; + if (pSyncPoint != nullptr) { + event = &signalEvent; + } UR_CALL(commandListLocked->appendMemBufferCopyRect( hSrcMem, hDstMem, srcOrigin, dstOrigin, region, srcRowPitch, - srcSlicePitch, dstRowPitch, dstSlicePitch, 0, nullptr, nullptr)); + srcSlicePitch, dstRowPitch, dstSlicePitch, numSyncPointsInWaitList, + eventsWaitList, event)); + if (pSyncPoint != nullptr) { + *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -373,11 +482,11 @@ ur_result_t urCommandBufferAppendMemBufferWriteRectExp( ur_rect_offset_t bufferOffset, ur_rect_offset_t hostOffset, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *pSrc, - uint32_t /*numSyncPointsInWaitList*/, - const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t /*numEventsInWaitList*/, const ur_event_handle_t * /*phEventWaitList*/, - ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t * /*phEvent*/, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { @@ -385,11 +494,21 @@ ur_result_t urCommandBufferAppendMemBufferWriteRectExp( // Responsibility of UMD to offload to copy engine auto commandListLocked = hCommandBuffer->commandListManager.lock(); + auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( + pSyncPointWaitList, numSyncPointsInWaitList); + ur_event_handle_t *event = nullptr; + ur_event_handle_t signalEvent = nullptr; + if (pSyncPoint != nullptr) { + event = &signalEvent; + } UR_CALL(commandListLocked->appendMemBufferWriteRect( hBuffer, false, bufferOffset, hostOffset, region, bufferRowPitch, - bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, 0, nullptr, - nullptr)); + bufferSlicePitch, hostRowPitch, hostSlicePitch, pSrc, + numSyncPointsInWaitList, eventsWaitList, event)); + if (pSyncPoint != nullptr) { + *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -400,11 +519,11 @@ ur_result_t urCommandBufferAppendMemBufferReadRectExp( ur_rect_offset_t bufferOffset, ur_rect_offset_t hostOffset, ur_rect_region_t region, size_t bufferRowPitch, size_t bufferSlicePitch, size_t hostRowPitch, size_t hostSlicePitch, void *pDst, - uint32_t /*numSyncPointsInWaitList*/, - const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t /*numEventsInWaitList*/, const ur_event_handle_t * /*phEventWaitList*/, - ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t * /*phEvent*/, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { @@ -412,11 +531,21 @@ ur_result_t urCommandBufferAppendMemBufferReadRectExp( // Responsibility of UMD to offload to copy engine auto commandListLocked = hCommandBuffer->commandListManager.lock(); + auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( + pSyncPointWaitList, numSyncPointsInWaitList); + ur_event_handle_t *event = nullptr; + ur_event_handle_t signalEvent = nullptr; + if (pSyncPoint != nullptr) { + event = &signalEvent; + } UR_CALL(commandListLocked->appendMemBufferReadRect( hBuffer, false, bufferOffset, hostOffset, region, bufferRowPitch, - bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, 0, nullptr, - nullptr)); + bufferSlicePitch, hostRowPitch, hostSlicePitch, pDst, + numSyncPointsInWaitList, eventsWaitList, event)); + if (pSyncPoint != nullptr) { + *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -425,17 +554,28 @@ ur_result_t urCommandBufferAppendMemBufferReadRectExp( ur_result_t urCommandBufferAppendUSMFillExp( ur_exp_command_buffer_handle_t hCommandBuffer, void *pMemory, const void *pPattern, size_t patternSize, size_t size, - uint32_t /*numSyncPointsInWaitList*/, - const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t /*numEventsInWaitList*/, const ur_event_handle_t * /*phEventWaitList*/, - ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t * /*phEvent*/, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { auto commandListLocked = hCommandBuffer->commandListManager.lock(); + auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( + pSyncPointWaitList, numSyncPointsInWaitList); + ur_event_handle_t *event = nullptr; + ur_event_handle_t signalEvent = nullptr; + if (pSyncPoint != nullptr) { + event = &signalEvent; + } UR_CALL(commandListLocked->appendUSMFill(pMemory, patternSize, pPattern, size, - 0, nullptr, nullptr)); + numSyncPointsInWaitList, + eventsWaitList, event)); + if (pSyncPoint != nullptr) { + *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -444,18 +584,29 @@ ur_result_t urCommandBufferAppendUSMFillExp( ur_result_t urCommandBufferAppendMemBufferFillExp( ur_exp_command_buffer_handle_t hCommandBuffer, ur_mem_handle_t hBuffer, const void *pPattern, size_t patternSize, size_t offset, size_t size, - uint32_t /*numSyncPointsInWaitList*/, - const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t /*numEventsInWaitList*/, const ur_event_handle_t * /*phEventWaitList*/, - ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t * /*phEvent*/, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { // the same issue as in urCommandBufferAppendKernelLaunchExp auto commandListLocked = hCommandBuffer->commandListManager.lock(); + auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( + pSyncPointWaitList, numSyncPointsInWaitList); + ur_event_handle_t *event = nullptr; + ur_event_handle_t signalEvent = nullptr; + if (pSyncPoint != nullptr) { + event = &signalEvent; + } UR_CALL(commandListLocked->appendMemBufferFill( - hBuffer, pPattern, patternSize, offset, size, 0, nullptr, nullptr)); + hBuffer, pPattern, patternSize, offset, size, numSyncPointsInWaitList, + eventsWaitList, event)); + if (pSyncPoint != nullptr) { + *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -464,20 +615,30 @@ ur_result_t urCommandBufferAppendMemBufferFillExp( ur_result_t urCommandBufferAppendUSMPrefetchExp( ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, size_t size, ur_usm_migration_flags_t flags, - uint32_t /*numSyncPointsInWaitList*/, - const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, + uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t /*numEventsInWaitList*/, const ur_event_handle_t * /*phEventWaitList*/, - ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t * /*phEvent*/, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { // the same issue as in urCommandBufferAppendKernelLaunchExp auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMPrefetch(pMemory, size, flags, 0, nullptr, - nullptr)); + auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( + pSyncPointWaitList, numSyncPointsInWaitList); + ur_event_handle_t *event = nullptr; + ur_event_handle_t signalEvent = nullptr; + if (pSyncPoint != nullptr) { + event = &signalEvent; + } + UR_CALL(commandListLocked->appendUSMPrefetch( + pMemory, size, flags, numSyncPointsInWaitList, eventsWaitList, event)); + if (pSyncPoint != nullptr) { + *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -485,19 +646,29 @@ ur_result_t urCommandBufferAppendUSMPrefetchExp( ur_result_t urCommandBufferAppendUSMAdviseExp( ur_exp_command_buffer_handle_t hCommandBuffer, const void *pMemory, - size_t size, ur_usm_advice_flags_t advice, - uint32_t /*numSyncPointsInWaitList*/, - const ur_exp_command_buffer_sync_point_t * /*pSyncPointWaitList*/, + size_t size, ur_usm_advice_flags_t advice, uint32_t numSyncPointsInWaitList, + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, uint32_t /*numEventsInWaitList*/, const ur_event_handle_t * /*phEventWaitList*/, - ur_exp_command_buffer_sync_point_t * /*pSyncPoint*/, + ur_exp_command_buffer_sync_point_t *pSyncPoint, ur_event_handle_t * /*phEvent*/, ur_exp_command_buffer_command_handle_t * /*phCommand*/) try { // the same issue as in urCommandBufferAppendKernelLaunchExp auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMAdvise(pMemory, size, advice, nullptr)); + auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( + pSyncPointWaitList, numSyncPointsInWaitList); + ur_event_handle_t *event = nullptr; + ur_event_handle_t signalEvent = nullptr; + if (pSyncPoint != nullptr) { + event = &signalEvent; + } + UR_CALL(commandListLocked->appendUSMAdvise( + pMemory, size, advice, numSyncPointsInWaitList, eventsWaitList, event)); + if (pSyncPoint != nullptr) { + *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); + } return UR_RESULT_SUCCESS; } catch (...) { return exceptionToResult(std::current_exception()); @@ -518,7 +689,7 @@ urCommandBufferGetInfoExp(ur_exp_command_buffer_handle_t hCommandBuffer, Descriptor.stype = UR_STRUCTURE_TYPE_EXP_COMMAND_BUFFER_DESC; Descriptor.pNext = nullptr; Descriptor.isUpdatable = hCommandBuffer->isUpdatable; - Descriptor.isInOrder = true; + Descriptor.isInOrder = hCommandBuffer->isInOrder; Descriptor.enableProfiling = hCommandBuffer->isProfilingEnabled; return ReturnValue(Descriptor); @@ -538,22 +709,28 @@ ur_result_t urCommandBufferAppendNativeCommandExp( uint32_t numSyncPointsInWaitList, const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, ur_exp_command_buffer_sync_point_t *pSyncPoint) { - // sync mechanic can be ignored, because all lists are in-order - (void)numSyncPointsInWaitList; - (void)pSyncPointWaitList; - (void)pSyncPoint; - // Barrier on all commands before user defined commands. auto commandListLocked = hCommandBuffer->commandListManager.lock(); - UR_CALL(commandListLocked->appendBarrier(0, nullptr, nullptr)); + auto eventsWaitList = hCommandBuffer->getWaitListFromSyncPoints( + pSyncPointWaitList, numSyncPointsInWaitList); + ur_event_handle_t *event = nullptr; + ur_event_handle_t signalEvent = nullptr; + if (pSyncPoint != nullptr) { + event = &signalEvent; + } + UR_CALL(commandListLocked->appendBarrier(numSyncPointsInWaitList, + eventsWaitList, nullptr)); // Call user-defined function immediately pfnNativeCommand(pData); // Barrier on all commands after user defined commands. - UR_CALL(commandListLocked->appendBarrier(0, nullptr, nullptr)); + UR_CALL(commandListLocked->appendBarrier(0, nullptr, event)); + if (pSyncPoint != nullptr) { + *pSyncPoint = hCommandBuffer->getSyncPoint(signalEvent); + } return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/level_zero/v2/command_buffer.hpp b/unified-runtime/source/adapters/level_zero/v2/command_buffer.hpp index 91f7df69c3d05..155c8c3b4a3a6 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_buffer.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_buffer.hpp @@ -32,14 +32,17 @@ struct ur_exp_command_buffer_handle_t_ : public ur_object { ur_result_t registerExecutionEventUnlocked(ur_event_handle_t nextExecutionEvent); - lockable commandListManager; - - ur_result_t finalizeCommandBuffer(); // Indicates if command-buffer commands can be updated after it is closed. const bool isUpdatable = false; + const bool isInOrder = true; + // Command-buffer profiling is enabled. const bool isProfilingEnabled = false; + lockable commandListManager; + + ur_result_t finalizeCommandBuffer(); + ur_result_t createCommandHandle(locked &commandListLocked, ur_kernel_handle_t hKernel, uint32_t workDim, @@ -51,11 +54,25 @@ struct ur_exp_command_buffer_handle_t_ : public ur_object { uint32_t numUpdateCommands, const ur_exp_command_buffer_update_kernel_launch_desc_t *updateCommands); + ur_exp_command_buffer_sync_point_t getSyncPoint(ur_event_handle_t event); + ur_event_handle_t *getWaitListFromSyncPoints( + const ur_exp_command_buffer_sync_point_t *pSyncPointWaitList, + uint32_t numSyncPointsInWaitList); + private: + // Stores all sync points that are created by the command buffer. + std::vector syncPoints; + + // Temporary storage for sync points that are passed to function that require + // array of events. This is used to avoid allocating a new memory every time. + std::vector syncPointWaitList; + const ur_context_handle_t context; const ur_device_handle_t device; + std::vector> commandHandles; + // Indicates if command-buffer was finalized. bool isFinalized = false; diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp index d6f865d80b5c3..d93d88847c9b0 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp @@ -18,10 +18,11 @@ ur_command_list_manager::ur_command_list_manager( ur_context_handle_t context, ur_device_handle_t device, v2::raii::command_list_unique_handle &&commandList, v2::event_flags_t flags, - ur_queue_t_ *queue) - : context(context), device(device), - eventPool(context->getEventPoolCache().borrow(device->Id.value(), flags)), - zeCommandList(std::move(commandList)), queue(queue) { + ur_queue_t_ *queue, PoolCacheType listType) + : context(context), device(device), zeCommandList(std::move(commandList)), + queue(queue) { + auto &eventPoolTmp = context->getEventPoolCache(listType); + eventPool = eventPoolTmp.borrow(device->Id.value(), flags); UR_CALL_THROWS(ur::level_zero::urContextRetain(context)); UR_CALL_THROWS(ur::level_zero::urDeviceRetain(device)); } @@ -320,17 +321,18 @@ ur_result_t ur_command_list_manager::appendUSMPrefetch( return UR_RESULT_SUCCESS; } -ur_result_t -ur_command_list_manager::appendUSMAdvise(const void *pMem, size_t size, - ur_usm_advice_flags_t advice, - ur_event_handle_t *phEvent) { +ur_result_t ur_command_list_manager::appendUSMAdvise( + const void *pMem, size_t size, ur_usm_advice_flags_t advice, + uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, + ur_event_handle_t *phEvent) { TRACK_SCOPE_LATENCY("ur_command_list_manager::appendUSMAdvise"); auto zeAdvice = ur_cast(advice); auto zeSignalEvent = getSignalEvent(phEvent, UR_COMMAND_USM_ADVISE); - auto [pWaitEvents, numWaitEvents] = getWaitListView(nullptr, 0); + auto [pWaitEvents, numWaitEvents] = + getWaitListView(phEventWaitList, numEventsInWaitList); if (pWaitEvents) { ZE2UR_CALL(zeCommandListAppendWaitOnEvents, diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp index 74c3f85ea3643..af23248dfc4f8 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.hpp @@ -11,6 +11,7 @@ #include "command_list_cache.hpp" #include "common.hpp" +#include "context.hpp" #include "event_pool_cache.hpp" #include "memory.hpp" #include "queue_api.hpp" @@ -39,7 +40,8 @@ struct ur_command_list_manager { ur_command_list_manager(ur_context_handle_t context, ur_device_handle_t device, v2::raii::command_list_unique_handle &&commandList, - v2::event_flags_t flags, ur_queue_t_ *queue); + v2::event_flags_t flags, ur_queue_t_ *queue, + PoolCacheType listType); ur_command_list_manager(const ur_command_list_manager &src) = delete; ur_command_list_manager(ur_command_list_manager &&src) = default; @@ -128,6 +130,8 @@ struct ur_command_list_manager { ur_result_t appendUSMAdvise(const void *pMem, size_t size, ur_usm_advice_flags_t advice, + uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent); ur_result_t appendBarrier(uint32_t numEventsInWaitList, diff --git a/unified-runtime/source/adapters/level_zero/v2/context.cpp b/unified-runtime/source/adapters/level_zero/v2/context.cpp index 050511d379b03..fe12d80365848 100644 --- a/unified-runtime/source/adapters/level_zero/v2/context.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/context.cpp @@ -53,16 +53,25 @@ ur_context_handle_t_::ur_context_handle_t_(ze_context_handle_t hContext, commandListCache(hContext, {phDevices[0]->Platform->ZeCopyOffloadExtensionSupported, phDevices[0]->Platform->ZeMutableCmdListExt.Supported}), - eventPoolCache( + eventPoolCacheImmediate( this, phDevices[0]->Platform->getNumDevices(), [context = this](DeviceId /* deviceId*/, v2::event_flags_t flags) -> std::unique_ptr { - assert((flags & v2::EVENT_FLAGS_COUNTER) != 0); - // TODO: just use per-context id? return std::make_unique( context, v2::QUEUE_IMMEDIATE, flags); }), + eventPoolCacheRegular(this, phDevices[0]->Platform->getNumDevices(), + [context = this, platform = phDevices[0]->Platform]( + DeviceId deviceId, v2::event_flags_t flags) + -> std::unique_ptr { + std::ignore = deviceId; + std::ignore = platform; + + // TODO: just use per-context id? + return std::make_unique( + context, v2::QUEUE_REGULAR, flags); + }), nativeEventsPool(this, std::make_unique( this, v2::QUEUE_IMMEDIATE, v2::EVENT_FLAGS_PROFILING_ENABLED)), diff --git a/unified-runtime/source/adapters/level_zero/v2/context.hpp b/unified-runtime/source/adapters/level_zero/v2/context.hpp index 03bc20aa46178..c8b907a4a2f35 100644 --- a/unified-runtime/source/adapters/level_zero/v2/context.hpp +++ b/unified-runtime/source/adapters/level_zero/v2/context.hpp @@ -17,6 +17,8 @@ #include "event_pool_cache.hpp" #include "usm.hpp" +enum class PoolCacheType { Immediate, Regular }; + struct ur_context_handle_t_ : ur_object { ur_context_handle_t_(ze_context_handle_t hContext, uint32_t numDevices, const ur_device_handle_t *phDevices, bool ownZeContext); @@ -34,9 +36,18 @@ struct ur_context_handle_t_ : ur_object { getP2PDevices(ur_device_handle_t hDevice) const; v2::event_pool &getNativeEventsPool() { return nativeEventsPool; } - v2::event_pool_cache &getEventPoolCache() { return eventPoolCache; } v2::command_list_cache_t &getCommandListCache() { return commandListCache; } - + v2::event_pool_cache &getEventPoolCache(PoolCacheType type) { + switch (type) { + case PoolCacheType::Immediate: + return eventPoolCacheImmediate; + case PoolCacheType::Regular: + return eventPoolCacheRegular; + default: + assert(false && "Requested invalid event pool cache type"); + throw UR_RESULT_ERROR_INVALID_VALUE; + } + } // Checks if Device is covered by this context. // For that the Device or its root devices need to be in the context. bool isValidDevice(ur_device_handle_t Device) const; @@ -45,7 +56,8 @@ struct ur_context_handle_t_ : ur_object { const v2::raii::ze_context_handle_t hContext; const std::vector hDevices; v2::command_list_cache_t commandListCache; - v2::event_pool_cache eventPoolCache; + v2::event_pool_cache eventPoolCacheImmediate; + v2::event_pool_cache eventPoolCacheRegular; // pool used for urEventCreateWithNativeHandle when native handle is NULL // (uses non-counter based events to allow for signaling from host) diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp index 33c05a1402012..8608f948f2460 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_immediate_in_order.cpp @@ -76,7 +76,7 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, getZePriority(pProps ? pProps->flags : ur_queue_flags_t{}), getZeIndex(pProps)), - eventFlagsFromQueueFlags(flags), this) {} + eventFlagsFromQueueFlags(flags), this, PoolCacheType::Immediate) {} ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( ur_context_handle_t hContext, ur_device_handle_t hDevice, @@ -93,7 +93,7 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t( } } }), - eventFlagsFromQueueFlags(flags), this) {} + eventFlagsFromQueueFlags(flags), this, PoolCacheType::Immediate) {} ze_event_handle_t ur_queue_immediate_in_order_t::getSignalEvent( locked &commandList, ur_event_handle_t *hUserEvent, @@ -605,7 +605,8 @@ ur_queue_immediate_in_order_t::enqueueUSMAdvise(const void *pMem, size_t size, TRACK_SCOPE_LATENCY("ur_queue_immediate_in_order_t::enqueueUSMAdvise"); auto commandListLocked = commandListManager.lock(); - UR_CALL(commandListLocked->appendUSMAdvise(pMem, size, advice, phEvent)); + UR_CALL(commandListLocked->appendUSMAdvise(pMem, size, advice, 0, nullptr, + phEvent)); return UR_RESULT_SUCCESS; }