Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions source/adapters/opencl/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ add_ur_adapter(${TARGET_NAME} SHARED
${CMAKE_CURRENT_SOURCE_DIR}/program.cpp
${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp
${CMAKE_CURRENT_SOURCE_DIR}/sampler.cpp
${CMAKE_CURRENT_SOURCE_DIR}/usm.hpp
${CMAKE_CURRENT_SOURCE_DIR}/usm.cpp
${CMAKE_CURRENT_SOURCE_DIR}/usm_p2p.cpp
${CMAKE_CURRENT_SOURCE_DIR}/virtual_mem.cpp
Expand Down
179 changes: 140 additions & 39 deletions source/adapters/opencl/usm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@
#include <ur/ur.hpp>

#include "common.hpp"
#include "usm.hpp"

// Event-completion callback meant to be registered through clSetEventCallback.
// It drops one reference on the event that fired and then destroys the
// heap-allocated `T` handed over via `pUserData`; any resource cleanup is
// whatever `T`'s destructor performs.
template <class T>
void AllocDeleterCallback(cl_event event, cl_int, void *pUserData) {
  clReleaseEvent(event);
  delete static_cast<T *>(pUserData);
}

namespace umf {
ur_result_t getProviderNativeError(const char *, int32_t) {
Expand Down Expand Up @@ -312,32 +320,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
numEventsInWaitList, cl_adapter::cast<const cl_event *>(phEventWaitList),
&CopyEvent));

// Bundles everything needed to free a temporary host buffer later on: the
// USM free function pointer, the context the buffer belongs to, and the
// buffer itself. Construction retains the context; destruction frees the
// buffer and then releases the context.
struct DeleteCallbackInfo {
DeleteCallbackInfo(clMemBlockingFreeINTEL_fn USMFree, cl_context CLContext,
void *HostBuffer)
: USMFree(USMFree), CLContext(CLContext), HostBuffer(HostBuffer) {
// Hold a context reference so the context outlives the pending free.
clRetainContext(CLContext);
}
~DeleteCallbackInfo() {
// Free the buffer first, while the context reference is still held.
USMFree(CLContext, HostBuffer);
clReleaseContext(CLContext);
}
// Non-copyable: a copy would free the buffer and release the context twice.
DeleteCallbackInfo(const DeleteCallbackInfo &) = delete;
DeleteCallbackInfo &operator=(const DeleteCallbackInfo &) = delete;

clMemBlockingFreeINTEL_fn USMFree;
cl_context CLContext;
void *HostBuffer;
};

auto Info = new DeleteCallbackInfo(USMFree, CLContext, HostBuffer);
if (phEvent) {
// Since we're releasing this in the callback above we need to retain it
// here to keep the user copy alive.
CL_RETURN_ON_FAILURE(clRetainEvent(CopyEvent));
*phEvent = cl_adapter::cast<ur_event_handle_t>(CopyEvent);
}

auto DeleteCallback = [](cl_event, cl_int, void *pUserData) {
auto Info = static_cast<DeleteCallbackInfo *>(pUserData);
delete Info;
};
// This self destructs taking the event and allocation with it.
auto Info = new AllocDeleterCallbackInfo(USMFree, CLContext, HostBuffer);

ClErr = clSetEventCallback(CopyEvent, CL_COMPLETE, DeleteCallback, Info);
ClErr =
clSetEventCallback(CopyEvent, CL_COMPLETE,
AllocDeleterCallback<AllocDeleterCallbackInfo>, Info);
if (ClErr != CL_SUCCESS) {
// We can attempt to recover gracefully by attempting to wait for the copy
// to finish and deleting the info struct here.
Expand All @@ -346,11 +341,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMFill(
clReleaseEvent(CopyEvent);
CL_RETURN_ON_FAILURE(ClErr);
}
if (phEvent) {
*phEvent = cl_adapter::cast<ur_event_handle_t>(CopyEvent);
} else {
CL_RETURN_ON_FAILURE(clReleaseEvent(CopyEvent));
}

return UR_RESULT_SUCCESS;
}
Expand All @@ -369,20 +359,131 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy(
return mapCLErrorToUR(CLErr);
}

clEnqueueMemcpyINTEL_fn FuncPtr = nullptr;
ur_result_t RetVal = cl_ext::getExtFuncFromContext<clEnqueueMemcpyINTEL_fn>(
clGetMemAllocInfoINTEL_fn GetMemAllocInfo = nullptr;
UR_RETURN_ON_FAILURE(cl_ext::getExtFuncFromContext<clGetMemAllocInfoINTEL_fn>(
CLContext, cl_ext::ExtFuncPtrCache->clGetMemAllocInfoINTELCache,
cl_ext::GetMemAllocInfoName, &GetMemAllocInfo));

clEnqueueMemcpyINTEL_fn USMMemcpy = nullptr;
UR_RETURN_ON_FAILURE(cl_ext::getExtFuncFromContext<clEnqueueMemcpyINTEL_fn>(
CLContext, cl_ext::ExtFuncPtrCache->clEnqueueMemcpyINTELCache,
cl_ext::EnqueueMemcpyName, &FuncPtr);
cl_ext::EnqueueMemcpyName, &USMMemcpy));

if (FuncPtr) {
RetVal = mapCLErrorToUR(
FuncPtr(cl_adapter::cast<cl_command_queue>(hQueue), blocking, pDst,
pSrc, size, numEventsInWaitList,
cl_adapter::cast<const cl_event *>(phEventWaitList),
cl_adapter::cast<cl_event *>(phEvent)));
clMemBlockingFreeINTEL_fn USMFree = nullptr;
UR_RETURN_ON_FAILURE(cl_ext::getExtFuncFromContext<clMemBlockingFreeINTEL_fn>(
CLContext, cl_ext::ExtFuncPtrCache->clMemBlockingFreeINTELCache,
cl_ext::MemBlockingFreeName, &USMFree));

// Check if the two allocations are DEVICE allocations from different
// devices, if they are we need to do the copy indirectly via a host
// allocation.
cl_device_id SrcDevice = 0, DstDevice = 0;
CL_RETURN_ON_FAILURE(
GetMemAllocInfo(CLContext, pSrc, CL_MEM_ALLOC_DEVICE_INTEL,
sizeof(cl_device_id), &SrcDevice, nullptr));
CL_RETURN_ON_FAILURE(
GetMemAllocInfo(CLContext, pDst, CL_MEM_ALLOC_DEVICE_INTEL,
sizeof(cl_device_id), &DstDevice, nullptr));

if ((SrcDevice && DstDevice) && SrcDevice != DstDevice) {
// We need a queue associated with each device, so first figure out which
// one we weren't given.
cl_device_id QueueDevice = nullptr;
CL_RETURN_ON_FAILURE(clGetCommandQueueInfo(
cl_adapter::cast<cl_command_queue>(hQueue), CL_QUEUE_DEVICE,
sizeof(QueueDevice), &QueueDevice, nullptr));

cl_command_queue MissingQueue = nullptr, SrcQueue = nullptr,
DstQueue = nullptr;
if (QueueDevice == SrcDevice) {
MissingQueue = clCreateCommandQueue(CLContext, DstDevice, 0, &CLErr);
SrcQueue = cl_adapter::cast<cl_command_queue>(hQueue);
DstQueue = MissingQueue;
} else {
MissingQueue = clCreateCommandQueue(CLContext, SrcDevice, 0, &CLErr);
DstQueue = cl_adapter::cast<cl_command_queue>(hQueue);
SrcQueue = MissingQueue;
}
CL_RETURN_ON_FAILURE(CLErr);

cl_event HostCopyEvent = nullptr, FinalCopyEvent = nullptr;
clHostMemAllocINTEL_fn HostMemAlloc = nullptr;
UR_RETURN_ON_FAILURE(cl_ext::getExtFuncFromContext<clHostMemAllocINTEL_fn>(
CLContext, cl_ext::ExtFuncPtrCache->clHostMemAllocINTELCache,
cl_ext::HostMemAllocName, &HostMemAlloc));

auto HostAlloc = HostMemAlloc(CLContext, nullptr, size, 0, &CLErr);
CL_RETURN_ON_FAILURE(CLErr);

// Now that we've successfully allocated we should try to clean it up if we
// hit an error somewhere.
auto checkCLErr = [&](cl_int CLErr) -> ur_result_t {
if (CLErr != CL_SUCCESS) {
if (HostCopyEvent) {
clReleaseEvent(HostCopyEvent);
}
if (FinalCopyEvent) {
clReleaseEvent(FinalCopyEvent);
}
USMFree(CLContext, HostAlloc);
CL_RETURN_ON_FAILURE(CLErr);
}
return UR_RESULT_SUCCESS;
};

UR_RETURN_ON_FAILURE(checkCLErr(USMMemcpy(
SrcQueue, blocking, HostAlloc, pSrc, size, numEventsInWaitList,
cl_adapter::cast<const cl_event *>(phEventWaitList), &HostCopyEvent)));

UR_RETURN_ON_FAILURE(
checkCLErr(USMMemcpy(DstQueue, blocking, pDst, HostAlloc, size, 1,
&HostCopyEvent, &FinalCopyEvent)));

// If this is a blocking operation we can do our cleanup immediately,
// otherwise we need to defer it to an event callback.
if (blocking) {
CL_RETURN_ON_FAILURE(USMFree(CLContext, HostAlloc));
CL_RETURN_ON_FAILURE(clReleaseEvent(HostCopyEvent));
CL_RETURN_ON_FAILURE(clReleaseCommandQueue(MissingQueue));
if (phEvent) {
*phEvent = cl_adapter::cast<ur_event_handle_t>(FinalCopyEvent);
} else {
CL_RETURN_ON_FAILURE(clReleaseEvent(FinalCopyEvent));
}
} else {
if (phEvent) {
*phEvent = cl_adapter::cast<ur_event_handle_t>(FinalCopyEvent);
// We are going to release this event in our callback so we need to
// retain if the user wants a copy.
CL_RETURN_ON_FAILURE(clRetainEvent(FinalCopyEvent));
}

// This self destructs taking the event and allocation with it.
auto DeleterInfo = new AllocDeleterCallbackInfoWithQueue(
USMFree, CLContext, HostAlloc, MissingQueue);

CLErr = clSetEventCallback(
HostCopyEvent, CL_COMPLETE,
AllocDeleterCallback<AllocDeleterCallbackInfoWithQueue>, DeleterInfo);

if (CLErr != CL_SUCCESS) {
// We can attempt to recover gracefully by attempting to wait for the
// copy to finish and deleting the info struct here.
clWaitForEvents(1, &HostCopyEvent);
delete DeleterInfo;
clReleaseEvent(HostCopyEvent);
CL_RETURN_ON_FAILURE(CLErr);
}
}
} else {
CL_RETURN_ON_FAILURE(
USMMemcpy(cl_adapter::cast<cl_command_queue>(hQueue), blocking, pDst,
pSrc, size, numEventsInWaitList,
cl_adapter::cast<const cl_event *>(phEventWaitList),
cl_adapter::cast<cl_event *>(phEvent)));
}

return RetVal;
return UR_RESULT_SUCCESS;
}

UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMPrefetch(
Expand Down
60 changes: 60 additions & 0 deletions source/adapters/opencl/usm.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
//===--------------------- usm.hpp - OpenCL Adapter -----------------------===//
//
// Copyright (C) 2024 Intel Corporation
//
// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
// Exceptions. See LICENSE.TXT
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "CL/cl_ext.h"
#include <CL/cl.h>

// This struct is intended to be used in conjunction with the below callback via
// clSetEventCallback to release temporary allocations created by the adapter to
// implement certain USM operations.
//
// Example usage:
//
//   auto Info = new AllocDeleterCallbackInfo(USMFreeFuncPtr, Context,
//                                            Allocation);
//   clSetEventCallback(USMOpEvent, CL_COMPLETE,
//                      AllocDeleterCallback<AllocDeleterCallbackInfo>, Info);
struct AllocDeleterCallbackInfo {
AllocDeleterCallbackInfo(clMemBlockingFreeINTEL_fn USMFree,
cl_context CLContext, void *Allocation)
: USMFree(USMFree), CLContext(CLContext), Allocation(Allocation) {
clRetainContext(CLContext);
}
~AllocDeleterCallbackInfo() {
USMFree(CLContext, Allocation);
clReleaseContext(CLContext);
}
AllocDeleterCallbackInfo(const AllocDeleterCallbackInfo &) = delete;
AllocDeleterCallbackInfo &
operator=(const AllocDeleterCallbackInfo &) = delete;

clMemBlockingFreeINTEL_fn USMFree;
cl_context CLContext;
void *Allocation;
};

// Variant of AllocDeleterCallbackInfo that additionally disposes of a command
// queue when it is destroyed.
//
// Ownership:
//  - The context reference is managed entirely by the base class (retained in
//    its constructor, released once in its destructor). This constructor must
//    NOT retain the context again: nothing would ever release that second
//    reference, leaking the context for every instance created.
//  - The queue reference passed in is adopted as-is (it is not retained here)
//    and released in the destructor, so the caller hands its reference over.
//
// Destruction order: this destructor body runs before the base destructor, so
// the queue is released first, then the allocation is freed and the context
// released by the base class.
struct AllocDeleterCallbackInfoWithQueue : AllocDeleterCallbackInfo {
  AllocDeleterCallbackInfoWithQueue(clMemBlockingFreeINTEL_fn USMFree,
                                    cl_context CLContext, void *Allocation,
                                    cl_command_queue CLQueue)
      : AllocDeleterCallbackInfo(USMFree, CLContext, Allocation),
        CLQueue(CLQueue) {}
  ~AllocDeleterCallbackInfoWithQueue() { clReleaseCommandQueue(CLQueue); }
  // Non-copyable: a copy would release the queue (and base resources) twice.
  AllocDeleterCallbackInfoWithQueue(const AllocDeleterCallbackInfoWithQueue &) =
      delete;
  AllocDeleterCallbackInfoWithQueue &
  operator=(const AllocDeleterCallbackInfoWithQueue &) = delete;

  cl_command_queue CLQueue; // queue reference released on destruction
};

// Event callback for use with clSetEventCallback. `T` is the concrete type of
// the object passed as `pUserData` (e.g. AllocDeleterCallbackInfo or
// AllocDeleterCallbackInfoWithQueue). The definition lives in usm.cpp, where
// it releases `event` and deletes `pUserData` as a `T *`.
template <class T>
void AllocDeleterCallback(cl_event event, cl_int, void *pUserData);
98 changes: 98 additions & 0 deletions test/conformance/enqueue/urEnqueueUSMMemcpy.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -167,3 +167,101 @@ TEST_P(urEnqueueUSMMemcpyTest, InvalidNullPtrEventWaitList) {
}

UUR_INSTANTIATE_DEVICE_TEST_SUITE_P(urEnqueueUSMMemcpyTest);

// Fixture for USM memcpy tests that need two devices in one context.
//
// SetUp picks the first two devices reporting USM device support (skipping the
// test when fewer than two exist), creates a shared context with one queue per
// device, allocates a host buffer plus one device buffer on each device, and
// fills the source device buffer with `fill_pattern`.
struct urEnqueueUSMMemcpyMultiDeviceTest : uur::urAllDevicesTest {
    void SetUp() override {
        uur::urAllDevicesTest::SetUp();

        // Gather at most two USM-capable devices.
        for (auto &device : devices) {
            ur_device_usm_access_capability_flags_t device_usm = 0;
            ASSERT_SUCCESS(uur::GetDeviceUSMDeviceSupport(device, device_usm));
            if (!device_usm) {
                continue;
            }
            usm_devices.push_back(device);
            if (usm_devices.size() == 2) {
                break;
            }
        }

        if (usm_devices.size() < 2) {
            GTEST_SKIP() << "Not enough devices in platform with USM support";
        }

        // One context spanning both devices, one queue for each of them.
        ASSERT_SUCCESS(urContextCreate(usm_devices.size(), usm_devices.data(),
                                       nullptr, &context));
        ASSERT_SUCCESS(
            urQueueCreate(context, usm_devices[0], nullptr, &src_queue));
        ASSERT_SUCCESS(
            urQueueCreate(context, usm_devices[1], nullptr, &dst_queue));

        // Host staging buffer and a device buffer on each device.
        ASSERT_SUCCESS(
            urUSMHostAlloc(context, nullptr, nullptr, alloc_size, &host_alloc));
        ASSERT_SUCCESS(urUSMDeviceAlloc(context, usm_devices[0], nullptr,
                                        nullptr, alloc_size, &src_alloc));
        ASSERT_SUCCESS(urUSMDeviceAlloc(context, usm_devices[1], nullptr,
                                        nullptr, alloc_size, &dst_alloc));

        // Seed the source buffer and wait until the fill has finished.
        ASSERT_SUCCESS(urEnqueueUSMFill(src_queue, src_alloc,
                                        sizeof(fill_pattern), &fill_pattern,
                                        alloc_size, 0, nullptr, nullptr));
        ASSERT_SUCCESS(urQueueFinish(src_queue));
    }

    // Releases whatever SetUp managed to create; every handle is checked for
    // null first, so this is safe after a skipped or partially failed SetUp.
    void TearDown() override {
        if (src_alloc) {
            ASSERT_SUCCESS(urUSMFree(context, src_alloc));
        }
        if (dst_alloc) {
            ASSERT_SUCCESS(urUSMFree(context, dst_alloc));
        }
        if (host_alloc) {
            ASSERT_SUCCESS(urUSMFree(context, host_alloc));
        }
        if (src_queue) {
            ASSERT_SUCCESS(urQueueRelease(src_queue));
        }
        if (dst_queue) {
            ASSERT_SUCCESS(urQueueRelease(dst_queue));
        }
        if (context) {
            ASSERT_SUCCESS(urContextRelease(context));
        }
        uur::urAllDevicesTest::TearDown();
    }

    // Checks that every byte of the host buffer equals the fill pattern.
    void verifyData() {
        for (size_t i = 0; i != alloc_size; ++i) {
            EXPECT_EQ(static_cast<uint8_t *>(host_alloc)[i], fill_pattern);
        }
    }

    std::vector<ur_device_handle_t> usm_devices;
    ur_context_handle_t context = nullptr;
    ur_queue_handle_t src_queue = nullptr;
    ur_queue_handle_t dst_queue = nullptr;
    void *src_alloc = nullptr;
    void *dst_alloc = nullptr;
    void *host_alloc = nullptr;
    size_t alloc_size = 64;
    uint8_t fill_pattern = 42;
};

// Copies the pre-filled buffer on the first device to the buffer on the second
// device with a blocking memcpy, then reads the destination back into the host
// buffer (also blocking) and checks that the fill pattern arrived intact.
TEST_F(urEnqueueUSMMemcpyMultiDeviceTest, DeviceToDeviceCopyBlocking) {
// Device-to-device copy, enqueued on the source device's queue.
ASSERT_SUCCESS(urEnqueueUSMMemcpy(src_queue, true, dst_alloc, src_alloc,
alloc_size, 0, nullptr, nullptr));
// Read the result back to host memory via the destination device's queue.
ASSERT_SUCCESS(urEnqueueUSMMemcpy(dst_queue, true, host_alloc, dst_alloc,
alloc_size, 0, nullptr, nullptr));
verifyData();
}

// Same scenario as the blocking test, but the device-to-device copy is enqueued
// non-blocking and completion is observed through the returned event before
// the data is read back and verified.
TEST_F(urEnqueueUSMMemcpyMultiDeviceTest, DeviceToDeviceCopyNonBlocking) {
ur_event_handle_t device_copy_event = nullptr;
// Non-blocking device-to-device copy; the event tracks its completion.
ASSERT_SUCCESS(urEnqueueUSMMemcpy(src_queue, false, dst_alloc, src_alloc,
alloc_size, 0, nullptr,
&device_copy_event));
// Flush the queue, then wait on and release the event.
ASSERT_SUCCESS(urQueueFlush(src_queue));
ASSERT_SUCCESS(urEventWait(1, &device_copy_event));
ASSERT_SUCCESS(urEventRelease(device_copy_event));
// Blocking read-back to the host buffer so the contents can be checked.
ASSERT_SUCCESS(urEnqueueUSMMemcpy(dst_queue, true, host_alloc, dst_alloc,
alloc_size, 0, nullptr, nullptr));
verifyData();
}
Loading