From 78c33ce5e84804a4522d36fb48672eb30bdba4a9 Mon Sep 17 00:00:00 2001
From: Aaron Greig
Date: Mon, 19 Aug 2024 10:12:00 +0100
Subject: [PATCH] Implement workaround to allow device -> device USM memcpy on CL.

---
Review notes (this section is ignored when the patch is applied):
 - urEnqueueUSMMemcpy: the deferred cleanup of the staging allocation and the
   helper queue is now registered on the final (host -> destination) copy
   rather than on the first copy, the helper queue is flushed for non-blocking
   copies, and every failure path releases the helper queue.
 - usm.hpp: added #pragma once; the include whose header name was missing is
   now <CL/cl.h>, which covers the OpenCL types the header uses.
 - Template arguments and angle-bracket header names were missing from the
   text this patch was recovered from. They have been reinstated from the
   surrounding usage. Please verify against the tree before applying:
   the first #include in the usm.cpp context (assumed to be <ur/ur.hpp>), the
   explicit template arguments on getExtFuncFromContext, and the indentation
   of context lines. The blob ids on the index lines and the author e-mail
   are unchanged from the recovered text.

 source/adapters/opencl/CMakeLists.txt         |   1 +
 source/adapters/opencl/usm.cpp                | 210 ++++++++++++++----
 source/adapters/opencl/usm.hpp                |  62 ++++++
 .../enqueue/urEnqueueUSMMemcpy.cpp            |  98 ++++++++++
 4 files changed, 332 insertions(+), 39 deletions(-)
 create mode 100644 source/adapters/opencl/usm.hpp

diff --git a/source/adapters/opencl/CMakeLists.txt b/source/adapters/opencl/CMakeLists.txt
index 8bf67239b0..a7e91f75e5 100644
--- a/source/adapters/opencl/CMakeLists.txt
+++ b/source/adapters/opencl/CMakeLists.txt
@@ -37,6 +37,7 @@ add_ur_adapter(${TARGET_NAME} SHARED
         ${CMAKE_CURRENT_SOURCE_DIR}/program.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/usm.hpp
         ${CMAKE_CURRENT_SOURCE_DIR}/usm.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp
         ${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp
diff --git a/source/adapters/opencl/usm.cpp b/source/adapters/opencl/usm.cpp
index 03ee4a18f3..61a6e7aaf7 100644
--- a/source/adapters/opencl/usm.cpp
+++ b/source/adapters/opencl/usm.cpp
@@ -11,6 +11,16 @@
 #include <ur/ur.hpp>
 
 #include "common.hpp"
+#include "usm.hpp"
+
+// Event callback that releases the event it fired on and deletes the T passed
+// as user data, running T's destructor to free whatever T owns.
+template <class T>
+void AllocDeleterCallback(cl_event event, cl_int, void *pUserData) {
+  clReleaseEvent(event);
+  auto Info = static_cast<T *>(pUserData);
+  delete Info;
+}
 
 namespace umf {
 ur_result_t getProviderNativeError(const char *, int32_t) {
@@ -312,32 +322,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
       numEventsInWaitList, cl_adapter::cast<const cl_event *>(phEventWaitList),
       &CopyEvent));
 
-  struct DeleteCallbackInfo {
-    DeleteCallbackInfo(clMemBlockingFreeINTEL_fn USMFree, cl_context CLContext,
-                       void *HostBuffer)
-        : USMFree(USMFree), CLContext(CLContext), HostBuffer(HostBuffer) {
-      clRetainContext(CLContext);
-    }
-    ~DeleteCallbackInfo() {
-      USMFree(CLContext, HostBuffer);
-      clReleaseContext(CLContext);
-    }
-    DeleteCallbackInfo(const DeleteCallbackInfo &) = delete;
-    DeleteCallbackInfo &operator=(const DeleteCallbackInfo &) = delete;
-
-    clMemBlockingFreeINTEL_fn USMFree;
-    cl_context CLContext;
-    void *HostBuffer;
-  };
-
-  auto Info = new DeleteCallbackInfo(USMFree, CLContext, HostBuffer);
+  if (phEvent) {
+    // Since we're releasing this in the callback above we need to retain it
+    // here to keep the user copy alive.
+    CL_RETURN_ON_FAILURE(clRetainEvent(CopyEvent));
+    *phEvent = cl_adapter::cast<ur_event_handle_t>(CopyEvent);
+  }
 
-  auto DeleteCallback = [](cl_event, cl_int, void *pUserData) {
-    auto Info = static_cast<DeleteCallbackInfo *>(pUserData);
-    delete Info;
-  };
+  // This self destructs taking the event and allocation with it.
+  auto Info = new AllocDeleterCallbackInfo(USMFree, CLContext, HostBuffer);
 
-  ClErr = clSetEventCallback(CopyEvent, CL_COMPLETE, DeleteCallback, Info);
+  ClErr =
+      clSetEventCallback(CopyEvent, CL_COMPLETE,
+                         AllocDeleterCallback<AllocDeleterCallbackInfo>, Info);
   if (ClErr != CL_SUCCESS) {
     // We can attempt to recover gracefully by attempting to wait for the copy
     // to finish and deleting the info struct here.
@@ -346,11 +343,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
     clReleaseEvent(CopyEvent);
     CL_RETURN_ON_FAILURE(ClErr);
   }
-  if (phEvent) {
-    *phEvent = cl_adapter::cast<ur_event_handle_t>(CopyEvent);
-  } else {
-    CL_RETURN_ON_FAILURE(clReleaseEvent(CopyEvent));
-  }
 
   return UR_RESULT_SUCCESS;
 }
@@ -369,20 +361,160 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy(
     return mapCLErrorToUR(CLErr);
   }
 
-  clEnqueueMemcpyINTEL_fn FuncPtr = nullptr;
-  ur_result_t RetVal = cl_ext::getExtFuncFromContext<clEnqueueMemcpyINTEL_fn>(
+  clGetMemAllocInfoINTEL_fn GetMemAllocInfo = nullptr;
+  UR_RETURN_ON_FAILURE(cl_ext::getExtFuncFromContext<clGetMemAllocInfoINTEL_fn>(
+      CLContext, cl_ext::ExtFuncPtrCache->clGetMemAllocInfoINTELCache,
+      cl_ext::GetMemAllocInfoName, &GetMemAllocInfo));
+
+  clEnqueueMemcpyINTEL_fn USMMemcpy = nullptr;
+  UR_RETURN_ON_FAILURE(cl_ext::getExtFuncFromContext<clEnqueueMemcpyINTEL_fn>(
       CLContext, cl_ext::ExtFuncPtrCache->clEnqueueMemcpyINTELCache,
-      cl_ext::EnqueueMemcpyName, &FuncPtr);
+      cl_ext::EnqueueMemcpyName, &USMMemcpy));
 
-  if (FuncPtr) {
-    RetVal = mapCLErrorToUR(
-        FuncPtr(cl_adapter::cast<cl_command_queue>(hQueue), blocking, pDst,
-                pSrc, size, numEventsInWaitList,
-                cl_adapter::cast<const cl_event *>(phEventWaitList),
-                cl_adapter::cast<cl_event *>(phEvent)));
+  clMemBlockingFreeINTEL_fn USMFree = nullptr;
+  UR_RETURN_ON_FAILURE(cl_ext::getExtFuncFromContext<clMemBlockingFreeINTEL_fn>(
+      CLContext, cl_ext::ExtFuncPtrCache->clMemBlockingFreeINTELCache,
+      cl_ext::MemBlockingFreeName, &USMFree));
+
+  // Check if the two allocations are DEVICE allocations from different
+  // devices, if they are we need to do the copy indirectly via a host
+  // allocation.
+  cl_device_id SrcDevice = nullptr, DstDevice = nullptr;
+  CL_RETURN_ON_FAILURE(
+      GetMemAllocInfo(CLContext, pSrc, CL_MEM_ALLOC_DEVICE_INTEL,
+                      sizeof(cl_device_id), &SrcDevice, nullptr));
+  CL_RETURN_ON_FAILURE(
+      GetMemAllocInfo(CLContext, pDst, CL_MEM_ALLOC_DEVICE_INTEL,
+                      sizeof(cl_device_id), &DstDevice, nullptr));
+
+  if ((SrcDevice && DstDevice) && SrcDevice != DstDevice) {
+    // Look up the host allocator before acquiring anything that would need
+    // releasing if the lookup failed.
+    clHostMemAllocINTEL_fn HostMemAlloc = nullptr;
+    UR_RETURN_ON_FAILURE(cl_ext::getExtFuncFromContext<clHostMemAllocINTEL_fn>(
+        CLContext, cl_ext::ExtFuncPtrCache->clHostMemAllocINTELCache,
+        cl_ext::HostMemAllocName, &HostMemAlloc));
+
+    // We need a queue associated with each device, so first figure out which
+    // one we weren't given.
+    // NOTE(review): if hQueue belongs to neither device it is used as the
+    // destination queue; confirm whether that case needs rejecting.
+    cl_device_id QueueDevice = nullptr;
+    CL_RETURN_ON_FAILURE(clGetCommandQueueInfo(
+        cl_adapter::cast<cl_command_queue>(hQueue), CL_QUEUE_DEVICE,
+        sizeof(QueueDevice), &QueueDevice, nullptr));
+
+    cl_command_queue MissingQueue = nullptr, SrcQueue = nullptr,
+                     DstQueue = nullptr;
+    if (QueueDevice == SrcDevice) {
+      MissingQueue = clCreateCommandQueue(CLContext, DstDevice, 0, &CLErr);
+      SrcQueue = cl_adapter::cast<cl_command_queue>(hQueue);
+      DstQueue = MissingQueue;
+    } else {
+      MissingQueue = clCreateCommandQueue(CLContext, SrcDevice, 0, &CLErr);
+      DstQueue = cl_adapter::cast<cl_command_queue>(hQueue);
+      SrcQueue = MissingQueue;
+    }
+    CL_RETURN_ON_FAILURE(CLErr);
+
+    cl_event HostCopyEvent = nullptr, FinalCopyEvent = nullptr;
+    auto HostAlloc = HostMemAlloc(CLContext, nullptr, size, 0, &CLErr);
+    if (CLErr != CL_SUCCESS) {
+      clReleaseCommandQueue(MissingQueue);
+      return mapCLErrorToUR(CLErr);
+    }
+
+    // From here on we own a queue and a host allocation, so every failure
+    // path has to release both along with any events created so far.
+    auto checkCLErr = [&](cl_int CLErr) -> ur_result_t {
+      if (CLErr != CL_SUCCESS) {
+        if (HostCopyEvent) {
+          clReleaseEvent(HostCopyEvent);
+        }
+        if (FinalCopyEvent) {
+          clReleaseEvent(FinalCopyEvent);
+        }
+        USMFree(CLContext, HostAlloc);
+        clReleaseCommandQueue(MissingQueue);
+        CL_RETURN_ON_FAILURE(CLErr);
+      }
+      return UR_RESULT_SUCCESS;
+    };
+
+    UR_RETURN_ON_FAILURE(checkCLErr(USMMemcpy(
+        SrcQueue, blocking, HostAlloc, pSrc, size, numEventsInWaitList,
+        cl_adapter::cast<const cl_event *>(phEventWaitList), &HostCopyEvent)));
+
+    UR_RETURN_ON_FAILURE(
+        checkCLErr(USMMemcpy(DstQueue, blocking, pDst, HostAlloc, size, 1,
+                             &HostCopyEvent, &FinalCopyEvent)));
+
+    // The second copy keeps HostCopyEvent alive through its wait list, so our
+    // reference is no longer needed.
+    clReleaseEvent(HostCopyEvent);
+    HostCopyEvent = nullptr;
+
+    // If this is a blocking operation we can do our cleanup immediately,
+    // otherwise we need to defer it to an event callback.
+    if (blocking) {
+      // Attempt every release even if one fails so nothing is leaked, then
+      // report the first failure.
+      const cl_int FreeErr = USMFree(CLContext, HostAlloc);
+      const cl_int QueueErr = clReleaseCommandQueue(MissingQueue);
+      if (FreeErr != CL_SUCCESS || QueueErr != CL_SUCCESS) {
+        clReleaseEvent(FinalCopyEvent);
+        return mapCLErrorToUR(FreeErr != CL_SUCCESS ? FreeErr : QueueErr);
+      }
+      if (phEvent) {
+        *phEvent = cl_adapter::cast<ur_event_handle_t>(FinalCopyEvent);
+      } else {
+        CL_RETURN_ON_FAILURE(clReleaseEvent(FinalCopyEvent));
+      }
+    } else {
+      // The caller has no handle to MissingQueue, so it is up to us to submit
+      // the work enqueued on it.
+      UR_RETURN_ON_FAILURE(checkCLErr(clFlush(MissingQueue)));
+
+      if (phEvent) {
+        // The callback releases FinalCopyEvent, so take an extra reference
+        // for the caller before the callback can possibly run.
+        UR_RETURN_ON_FAILURE(checkCLErr(clRetainEvent(FinalCopyEvent)));
+        *phEvent = cl_adapter::cast<ur_event_handle_t>(FinalCopyEvent);
+      }
+
+      // This self destructs taking the event, allocation and queue with it.
+      auto DeleterInfo = new AllocDeleterCallbackInfoWithQueue(
+          USMFree, CLContext, HostAlloc, MissingQueue);
+
+      // Register on the final copy: the host allocation and the queue must
+      // outlive both copies, not just the first one.
+      CLErr = clSetEventCallback(
+          FinalCopyEvent, CL_COMPLETE,
+          AllocDeleterCallback<AllocDeleterCallbackInfoWithQueue>, DeleterInfo);
+
+      if (CLErr != CL_SUCCESS) {
+        // We can attempt to recover gracefully by attempting to wait for the
+        // copy to finish and deleting the info struct here.
+        clWaitForEvents(1, &FinalCopyEvent);
+        delete DeleterInfo;
+        clReleaseEvent(FinalCopyEvent);
+        if (phEvent) {
+          // Also drop the reference taken on the caller's behalf.
+          clReleaseEvent(FinalCopyEvent);
+          *phEvent = nullptr;
+        }
+        CL_RETURN_ON_FAILURE(CLErr);
+      }
+    }
+  } else {
+    CL_RETURN_ON_FAILURE(
+        USMMemcpy(cl_adapter::cast<cl_command_queue>(hQueue), blocking, pDst,
+                  pSrc, size, numEventsInWaitList,
+                  cl_adapter::cast<const cl_event *>(phEventWaitList),
+                  cl_adapter::cast<cl_event *>(phEvent)));
   }
 
-  return RetVal;
+  return UR_RESULT_SUCCESS;
 }
 
 UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
diff --git a/source/adapters/opencl/usm.hpp b/source/adapters/opencl/usm.hpp
new file mode 100644
index 0000000000..5cdb5daea3
--- /dev/null
+++ b/source/adapters/opencl/usm.hpp
@@ -0,0 +1,62 @@
+//===--------------------- usm.hpp - OpenCL Adapter -----------------------===//
+//
+// Copyright (C) 2024 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "CL/cl_ext.h"
+#include <CL/cl.h>
+
+// This struct is intended to be used in conjunction with the below callback via
+// clSetEventCallback to release temporary allocations created by the adapter to
+// implement certain USM operations.
+//
+// Example usage:
+//
+//   auto Info = new AllocDeleterCallbackInfo(USMFree, Context, Allocation);
+//   clSetEventCallback(USMOpEvent, CL_COMPLETE,
+//                      AllocDeleterCallback<AllocDeleterCallbackInfo>, Info);
+struct AllocDeleterCallbackInfo {
+  AllocDeleterCallbackInfo(clMemBlockingFreeINTEL_fn USMFree,
+                           cl_context CLContext, void *Allocation)
+      : USMFree(USMFree), CLContext(CLContext), Allocation(Allocation) {
+    clRetainContext(CLContext);
+  }
+  ~AllocDeleterCallbackInfo() {
+    USMFree(CLContext, Allocation);
+    clReleaseContext(CLContext);
+  }
+  AllocDeleterCallbackInfo(const AllocDeleterCallbackInfo &) = delete;
+  AllocDeleterCallbackInfo &
+  operator=(const AllocDeleterCallbackInfo &) = delete;
+
+  clMemBlockingFreeINTEL_fn USMFree;
+  cl_context CLContext;
+  void *Allocation;
+};
+
+struct AllocDeleterCallbackInfoWithQueue : AllocDeleterCallbackInfo {
+  AllocDeleterCallbackInfoWithQueue(clMemBlockingFreeINTEL_fn USMFree,
+                                    cl_context CLContext, void *Allocation,
+                                    cl_command_queue CLQueue)
+      : AllocDeleterCallbackInfo(USMFree, CLContext, Allocation),
+        CLQueue(CLQueue) {
+    // CLContext is retained by the base class; CLQueue's reference is adopted.
+  }
+  ~AllocDeleterCallbackInfoWithQueue() { clReleaseCommandQueue(CLQueue); }
+  AllocDeleterCallbackInfoWithQueue(const AllocDeleterCallbackInfoWithQueue &) =
+      delete;
+  AllocDeleterCallbackInfoWithQueue &
+  operator=(const AllocDeleterCallbackInfoWithQueue &) = delete;
+
+  cl_command_queue CLQueue;
+};
+
+template <class T>
+void AllocDeleterCallback(cl_event event, cl_int, void *pUserData);
diff --git a/test/conformance/enqueue/urEnqueueUSMMemcpy.cpp b/test/conformance/enqueue/urEnqueueUSMMemcpy.cpp
index 6cd16546e9..9189eaea4d 100644
--- a/test/conformance/enqueue/urEnqueueUSMMemcpy.cpp
+++ b/test/conformance/enqueue/urEnqueueUSMMemcpy.cpp
@@ -167,3 +167,101 @@ TEST_P(urEnqueueUSMMemcpyTest, InvalidNullPtrEventWaitList) {
 }
 
 UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueUSMMemcpyTest);
+
+struct urEnqueueUSMMemcpyMultiDeviceTest : uur::urAllDevicesTest {
+    void SetUp() override {
+        uur::urAllDevicesTest::SetUp();
+        for (auto &device : devices) {
+            ur_device_usm_access_capability_flags_t device_usm = 0;
+            ASSERT_SUCCESS(uur::GetDeviceUSMDeviceSupport(device, device_usm));
+            if (device_usm) {
+                usm_devices.push_back(device);
+                if (usm_devices.size() == 2) {
+                    break;
+                }
+            }
+        }
+
+        if (usm_devices.size() < 2) {
+            GTEST_SKIP() << "Not enough devices in platform with USM support";
+        }
+
+        ASSERT_SUCCESS(urContextCreate(usm_devices.size(), usm_devices.data(),
+                                       nullptr, &context));
+        ASSERT_SUCCESS(
+            urQueueCreate(context, usm_devices[0], nullptr, &src_queue));
+        ASSERT_SUCCESS(
+            urQueueCreate(context, usm_devices[1], nullptr, &dst_queue));
+
+        ASSERT_SUCCESS(
+            urUSMHostAlloc(context, nullptr, nullptr, alloc_size, &host_alloc));
+        ASSERT_SUCCESS(urUSMDeviceAlloc(context, usm_devices[0], nullptr,
+                                        nullptr, alloc_size, &src_alloc));
+        ASSERT_SUCCESS(urUSMDeviceAlloc(context, usm_devices[1], nullptr,
+                                        nullptr, alloc_size, &dst_alloc));
+
+        ASSERT_SUCCESS(urEnqueueUSMFill(src_queue, src_alloc,
+                                        sizeof(fill_pattern), &fill_pattern,
+                                        alloc_size, 0, nullptr, nullptr));
+        ASSERT_SUCCESS(urQueueFinish(src_queue));
+    }
+
+    void TearDown() override {
+        if (src_alloc) {
+            ASSERT_SUCCESS(urUSMFree(context, src_alloc));
+        }
+        if (dst_alloc) {
+            ASSERT_SUCCESS(urUSMFree(context, dst_alloc));
+        }
+        if (host_alloc) {
+            ASSERT_SUCCESS(urUSMFree(context, host_alloc));
+        }
+        if (src_queue) {
+            ASSERT_SUCCESS(urQueueRelease(src_queue));
+        }
+        if (dst_queue) {
+            ASSERT_SUCCESS(urQueueRelease(dst_queue));
+        }
+        if (context) {
+            ASSERT_SUCCESS(urContextRelease(context));
+        }
+        uur::urAllDevicesTest::TearDown();
+    }
+
+    void verifyData() {
+        for (size_t i = 0; i < alloc_size; i++) {
+            EXPECT_EQ(static_cast<uint8_t *>(host_alloc)[i], fill_pattern);
+        }
+    }
+
+    std::vector<ur_device_handle_t> usm_devices;
+    ur_context_handle_t context = nullptr;
+    ur_queue_handle_t src_queue = nullptr;
+    ur_queue_handle_t dst_queue = nullptr;
+    void *src_alloc = nullptr;
+    void *dst_alloc = nullptr;
+    void *host_alloc = nullptr;
+    size_t alloc_size = 64;
+    uint8_t fill_pattern = 42;
+};
+
+TEST_F(urEnqueueUSMMemcpyMultiDeviceTest, DeviceToDeviceCopyBlocking) {
+    ASSERT_SUCCESS(urEnqueueUSMMemcpy(src_queue, true, dst_alloc, src_alloc,
+                                      alloc_size, 0, nullptr, nullptr));
+    ASSERT_SUCCESS(urEnqueueUSMMemcpy(dst_queue, true, host_alloc, dst_alloc,
+                                      alloc_size, 0, nullptr, nullptr));
+    verifyData();
+}
+
+TEST_F(urEnqueueUSMMemcpyMultiDeviceTest, DeviceToDeviceCopyNonBlocking) {
+    ur_event_handle_t device_copy_event = nullptr;
+    ASSERT_SUCCESS(urEnqueueUSMMemcpy(src_queue, false, dst_alloc, src_alloc,
+                                      alloc_size, 0, nullptr,
+                                      &device_copy_event));
+    ASSERT_SUCCESS(urQueueFlush(src_queue));
+    ASSERT_SUCCESS(urEventWait(1, &device_copy_event));
+    ASSERT_SUCCESS(urEventRelease(device_copy_event));
+    ASSERT_SUCCESS(urEnqueueUSMMemcpy(dst_queue, true, host_alloc, dst_alloc,
+                                      alloc_size, 0, nullptr, nullptr));
+    verifyData();
+}