From b5335882d8dc96eafa15da13e5cd8002fc7aa92d Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 6 Jun 2025 15:28:43 +0100 Subject: [PATCH 1/4] [UR][Offload] Add initial membuffer implementation --- .../source/adapters/offload/CMakeLists.txt | 1 + .../source/adapters/offload/context.hpp | 5 + .../source/adapters/offload/enqueue.cpp | 65 +++++++++++ .../source/adapters/offload/kernel.cpp | 21 ++++ .../source/adapters/offload/kernel.hpp | 20 ++++ .../source/adapters/offload/memory.cpp | 104 ++++++++++++++++++ .../source/adapters/offload/memory.hpp | 64 +++++++++++ .../source/adapters/offload/queue.cpp | 1 + .../source/adapters/offload/queue.hpp | 1 + .../adapters/offload/ur_interface_loader.cpp | 12 +- .../conformance/memory/urMemBufferCreate.cpp | 4 +- 11 files changed, 291 insertions(+), 7 deletions(-) create mode 100644 unified-runtime/source/adapters/offload/memory.cpp create mode 100644 unified-runtime/source/adapters/offload/memory.hpp diff --git a/unified-runtime/source/adapters/offload/CMakeLists.txt b/unified-runtime/source/adapters/offload/CMakeLists.txt index 6c6a50625adc4..6f202f8b881e0 100644 --- a/unified-runtime/source/adapters/offload/CMakeLists.txt +++ b/unified-runtime/source/adapters/offload/CMakeLists.txt @@ -37,6 +37,7 @@ add_ur_adapter(${TARGET_NAME} ${CMAKE_CURRENT_SOURCE_DIR}/enqueue.cpp ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/memory.cpp ${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp ${CMAKE_CURRENT_SOURCE_DIR}/program.cpp ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp diff --git a/unified-runtime/source/adapters/offload/context.hpp b/unified-runtime/source/adapters/offload/context.hpp index 64727ce3338bb..69295c0499479 100644 --- a/unified-runtime/source/adapters/offload/context.hpp +++ b/unified-runtime/source/adapters/offload/context.hpp @@ -10,7 +10,9 @@ #pragma once +#include "adapter.hpp" #include "common.hpp" +#include "device.hpp" #include #include #include @@ -18,9 +20,12 @@ struct ur_context_handle_t_ : RefCounted { ur_context_handle_t_(ur_device_handle_t hDevice) : Device{hDevice} { urDeviceRetain(Device); + // For convenience, store the host device in the context + OffloadHost = Adapter.HostDevice; } ~ur_context_handle_t_() { urDeviceRelease(Device); } ur_device_handle_t Device; std::unordered_map AllocTypeMap; + ol_device_handle_t OffloadHost; }; diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp index 0124b4f28e34a..fe6db0a9da83e 100644 --- a/unified-runtime/source/adapters/offload/enqueue.cpp +++ b/unified-runtime/source/adapters/offload/enqueue.cpp @@ -12,8 +12,10 @@ #include #include +#include "context.hpp" #include "event.hpp" #include "kernel.hpp" +#include "memory.hpp" #include "queue.hpp" #include "ur2offload.hpp" @@ -88,3 +90,66 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy2D( size_t, uint32_t, const ur_event_handle_t *, ur_event_handle_t *) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingRead, + size_t offset, size_t size, void *pDst, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + // Ignore wait list for now + (void)numEventsInWaitList; + (void)phEventWaitList; + // + + ol_event_handle_t EventOut = nullptr; + + void *DevPtr = std::get(hBuffer->Mem).Ptr; + + olMemcpy(hQueue->OffloadQueue, pDst, hQueue->Context->OffloadHost, + DevPtr + offset, hQueue->OffloadDevice, size, + phEvent ? &EventOut : nullptr); + + if (blockingRead) { + olWaitQueue(hQueue->OffloadQueue); + } + + if (phEvent) { + auto *Event = new ur_event_handle_t_(); + Event->OffloadEvent = EventOut; + *phEvent = Event; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( + ur_queue_handle_t hQueue, ur_mem_handle_t hBuffer, bool blockingWrite, + size_t offset, size_t size, const void *pSrc, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + + // Ignore wait list for now + (void)numEventsInWaitList; + (void)phEventWaitList; + // + + ol_event_handle_t EventOut = nullptr; + + void *DevPtr = std::get(hBuffer->Mem).Ptr; + + // TODO: olMemcpy src should be const + olMemcpy(hQueue->OffloadQueue, DevPtr + offset, hQueue->OffloadDevice, + const_cast(pSrc), hQueue->Context->OffloadHost, size, + phEvent ? &EventOut : nullptr); + + if (blockingWrite) { + olWaitQueue(hQueue->OffloadQueue); + } + + if (phEvent) { + auto *Event = new ur_event_handle_t_(); + Event->OffloadEvent = EventOut; + *phEvent = Event; + } + + return UR_RESULT_SUCCESS; +} diff --git a/unified-runtime/source/adapters/offload/kernel.cpp b/unified-runtime/source/adapters/offload/kernel.cpp index 12bfe0478130a..b9e9152d437a2 100644 --- a/unified-runtime/source/adapters/offload/kernel.cpp +++ b/unified-runtime/source/adapters/offload/kernel.cpp @@ -9,6 +9,7 @@ //===----------------------------------------------------------------------===// #include "kernel.hpp" +#include "memory.hpp" #include "program.hpp" #include "ur2offload.hpp" #include @@ -83,6 +84,26 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL +urKernelSetArgMemObj(ur_kernel_handle_t hKernel, uint32_t argIndex, + const ur_kernel_arg_mem_obj_properties_t *Properties, + ur_mem_handle_t hArgValue) { + // Handle zero-sized buffers + if (hArgValue == nullptr) { + hKernel->Args.addArg(argIndex, 0, nullptr); + return UR_RESULT_SUCCESS; + } + + ur_mem_flags_t MemAccess = + Properties ? Properties->memoryAccess + : static_cast(UR_MEM_FLAG_READ_WRITE); + hKernel->Args.addMemObjArg(argIndex, hArgValue, MemAccess); + + auto Ptr = std::get(hArgValue->Mem).Ptr; + hKernel->Args.addArg(argIndex, sizeof(void *), &Ptr); + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo( ur_kernel_handle_t, ur_device_handle_t, ur_kernel_group_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet) { diff --git a/unified-runtime/source/adapters/offload/kernel.hpp b/unified-runtime/source/adapters/offload/kernel.hpp index dea7e25d9da9e..e8ff732d700f0 100644 --- a/unified-runtime/source/adapters/offload/kernel.hpp +++ b/unified-runtime/source/adapters/offload/kernel.hpp @@ -32,6 +32,13 @@ struct ur_kernel_handle_t_ : RefCounted { args_size_t ParamSizes; args_ptr_t Pointers; + struct MemObjArg { + ur_mem_handle_t_ *Mem; + int Index; + ur_mem_flags_t AccessFlags; + }; + std::vector MemObjArgs; + // Add an argument. If it already exists, it is replaced. Gaps are filled // with empty arguments. void addArg(size_t Index, size_t Size, const void *Arg) { @@ -48,6 +55,19 @@ struct ur_kernel_handle_t_ : RefCounted { Pointers[Index] = &Storage[InsertPos]; } + void addMemObjArg(int Index, ur_mem_handle_t hMem, ur_mem_flags_t Flags) { + assert(hMem && "Invalid mem handle"); + // If a memobj is already set at this index, update the entry rather + // than adding a duplicate one + for (auto &Arg : MemObjArgs) { + if (Arg.Index == Index) { + Arg = MemObjArg{hMem, Index, Flags}; + return; + } + } + MemObjArgs.push_back(MemObjArg{hMem, Index, Flags}); + } + const args_ptr_t &getPointers() const noexcept { return Pointers; } const char *getStorage() const noexcept { return Storage.data(); } diff --git a/unified-runtime/source/adapters/offload/memory.cpp b/unified-runtime/source/adapters/offload/memory.cpp new file mode 100644 index 0000000000000..bad7e40f02c9d --- /dev/null +++ b/unified-runtime/source/adapters/offload/memory.cpp @@ -0,0 +1,104 @@ +//===----------- memory.cpp - LLVM Offload Adapter -----------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include "context.hpp" +#include "device.hpp" +#include "memory.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( + ur_context_handle_t hContext, ur_mem_flags_t flags, size_t size, + const ur_buffer_properties_t *pProperties, ur_mem_handle_t *phBuffer) { + + // TODO: We can avoid the initial copy with USE_HOST_POINTER by implementing + // something like olMemRegister + const bool PerformInitialCopy = + (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) || + (flags & UR_MEM_FLAG_USE_HOST_POINTER); + + void *Ptr = nullptr; + auto HostPtr = pProperties ? pProperties->pHost : nullptr; + auto OffloadDevice = hContext->Device->OffloadDevice; + auto AllocMode = BufferMem::AllocMode::Default; + + if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { + olMemAlloc(OffloadDevice, OL_ALLOC_TYPE_HOST, size, &HostPtr); + // TODO: We (probably) need something like cuMemHostGetDevicePointer + // for this to work everywhere. For now assume the managed host pointer is + // device-accessible. + Ptr = HostPtr; + AllocMode = BufferMem::AllocMode::AllocHostPtr; + } else { + olMemAlloc(OffloadDevice, OL_ALLOC_TYPE_DEVICE, size, &Ptr); + if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { + AllocMode = BufferMem::AllocMode::CopyIn; + } + } + + ur_mem_handle_t ParentBuffer = nullptr; + auto URMemObj = std::unique_ptr(new ur_mem_handle_t_{ + hContext, ParentBuffer, flags, AllocMode, Ptr, HostPtr, size}); + + if (PerformInitialCopy) { + olMemcpy(nullptr, Ptr, OffloadDevice, HostPtr, hContext->OffloadHost, size, + nullptr); + } + + *phBuffer = URMemObj.release(); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemRetain(ur_mem_handle_t hMem) { + hMem->RefCount++; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { + if (--hMem->RefCount > 0) { + return UR_RESULT_SUCCESS; + } + + std::unique_ptr MemObjPtr(hMem); + if (hMem->MemType == ur_mem_handle_t_::Type::Buffer) { + // TODO: Handle registered host memory + auto &BufferImpl = std::get(MemObjPtr->Mem); + olMemFree(BufferImpl.Ptr); + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urMemGetInfo(ur_mem_handle_t hMemory, + ur_mem_info_t MemInfoType, + size_t propSize, + void *pMemInfo, + size_t *pPropSizeRet) { + UrReturnHelper ReturnValue(propSize, pMemInfo, pPropSizeRet); + + switch (MemInfoType) { + case UR_MEM_INFO_SIZE: { + return ReturnValue(std::get(hMemory->Mem).Size); + } + case UR_MEM_INFO_CONTEXT: { + return ReturnValue(hMemory->getContext()); + } + case UR_MEM_INFO_REFERENCE_COUNT: { + return ReturnValue(hMemory->RefCount.load()); + } + + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } +} diff --git a/unified-runtime/source/adapters/offload/memory.hpp b/unified-runtime/source/adapters/offload/memory.hpp new file mode 100644 index 0000000000000..48ea5d3a1f06b --- /dev/null +++ b/unified-runtime/source/adapters/offload/memory.hpp @@ -0,0 +1,64 @@ +//===----------- memory.hpp - LLVM Offload Adapter -----------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include "ur_api.h" + +#include "common.hpp" + +struct BufferMem { + enum class AllocMode { + Default, + UseHostPtr, + CopyIn, + AllocHostPtr, + }; + + ur_mem_handle_t Parent; + // Underlying device pointer + void *Ptr; + // Pointer associated with this device on the host + void *HostPtr; + size_t Size; + + AllocMode MemAllocMode; + + BufferMem(ur_mem_handle_t Parent, BufferMem::AllocMode Mode, void *Ptr, + void *HostPtr, size_t Size) + : Parent{Parent}, Ptr{Ptr}, HostPtr{HostPtr}, Size{Size}, + MemAllocMode{Mode} {}; + + void *get() const noexcept { return Ptr; } + size_t getSize() const noexcept { return Size; } +}; + +struct ur_mem_handle_t_ : RefCounted { + ur_context_handle_t Context; + + enum class Type { Buffer } MemType; + ur_mem_flags_t MemFlags; + + // For now we only support BufferMem. Eventually we'll support images, so use + // a variant to store the underlying object. + std::variant Mem; + + ur_mem_handle_t_(ur_context_handle_t Context, ur_mem_handle_t Parent, + ur_mem_flags_t MemFlags, BufferMem::AllocMode Mode, + void *Ptr, void *HostPtr, size_t Size) + : Context{Context}, MemType{Type::Buffer}, MemFlags{MemFlags}, + Mem{BufferMem{Parent, Mode, Ptr, HostPtr, Size}} { + urContextRetain(Context); + }; + + ~ur_mem_handle_t_() { urContextRelease(Context); } + + ur_context_handle_t getContext() const noexcept { return Context; } +}; diff --git a/unified-runtime/source/adapters/offload/queue.cpp b/unified-runtime/source/adapters/offload/queue.cpp index 0e12c6206dd84..eed6dcbd63692 100644 --- a/unified-runtime/source/adapters/offload/queue.cpp +++ b/unified-runtime/source/adapters/offload/queue.cpp @@ -31,6 +31,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( } Queue->OffloadDevice = hDevice->OffloadDevice; + Queue->Context = hContext; *phQueue = Queue; diff --git a/unified-runtime/source/adapters/offload/queue.hpp b/unified-runtime/source/adapters/offload/queue.hpp index 6afe4bf15098e..20812b3cf6f4f 100644 --- a/unified-runtime/source/adapters/offload/queue.hpp +++ b/unified-runtime/source/adapters/offload/queue.hpp @@ -18,4 +18,5 @@ struct ur_queue_handle_t_ : RefCounted { ol_queue_handle_t OffloadQueue; ol_device_handle_t OffloadDevice; + ur_context_handle_t Context; }; diff --git a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp index 87c7b6780065c..49987ac9719e9 100644 --- a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp @@ -149,16 +149,16 @@ urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) { if (UR_RESULT_SUCCESS != result) { return result; } - pDdiTable->pfnBufferCreate = nullptr; + pDdiTable->pfnBufferCreate = urMemBufferCreate; pDdiTable->pfnBufferPartition = nullptr; pDdiTable->pfnBufferCreateWithNativeHandle = nullptr; pDdiTable->pfnImageCreateWithNativeHandle = nullptr; - pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetInfo = urMemGetInfo; pDdiTable->pfnGetNativeHandle = nullptr; pDdiTable->pfnImageCreate = nullptr; pDdiTable->pfnImageGetInfo = nullptr; - pDdiTable->pfnRelease = nullptr; - pDdiTable->pfnRetain = nullptr; + pDdiTable->pfnRelease = urMemRelease; + pDdiTable->pfnRetain = urMemRetain; return UR_RESULT_SUCCESS; } @@ -177,9 +177,9 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( pDdiTable->pfnMemBufferCopyRect = nullptr; pDdiTable->pfnMemBufferFill = nullptr; pDdiTable->pfnMemBufferMap = nullptr; - pDdiTable->pfnMemBufferRead = nullptr; + pDdiTable->pfnMemBufferRead = urEnqueueMemBufferRead; pDdiTable->pfnMemBufferReadRect = nullptr; - pDdiTable->pfnMemBufferWrite = nullptr; + pDdiTable->pfnMemBufferWrite = urEnqueueMemBufferWrite; pDdiTable->pfnMemBufferWriteRect = nullptr; pDdiTable->pfnMemImageCopy = nullptr; pDdiTable->pfnMemImageRead = nullptr; diff --git a/unified-runtime/test/conformance/memory/urMemBufferCreate.cpp b/unified-runtime/test/conformance/memory/urMemBufferCreate.cpp index f2944eb5d1ef3..bf77c8004f4cc 100644 --- a/unified-runtime/test/conformance/memory/urMemBufferCreate.cpp +++ b/unified-runtime/test/conformance/memory/urMemBufferCreate.cpp @@ -121,8 +121,10 @@ TEST_P(urMemBufferCreateTest, CopyHostPointer) { TEST_P(urMemBufferCreateTest, UseHostPointer) { // These all copy memory instead of mapping it + // https://github.com/intel/llvm/issues/18836 UUR_KNOWN_FAILURE_ON(uur::LevelZero{}, uur::LevelZeroV2{}, uur::HIP{}, - uur::CUDA{}, uur::OpenCL{"Intel(R) UHD Graphics 770"}); + uur::CUDA{}, uur::OpenCL{"Intel(R) UHD Graphics 770"}, + uur::Offload{}); std::vector dataWrite{}; dataWrite.resize(4096); From 3341f1a3adfbc7ff7adc7b9162920cb5e4a5c8b6 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 6 Jun 2025 15:43:39 +0100 Subject: [PATCH 2/4] Check ol* results --- .../source/adapters/offload/enqueue.cpp | 15 ++++++++---- .../source/adapters/offload/memory.cpp | 23 +++++++++++++++---- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp index fe6db0a9da83e..361c3a95bc20b 100644 --- a/unified-runtime/source/adapters/offload/enqueue.cpp +++ b/unified-runtime/source/adapters/offload/enqueue.cpp @@ -137,12 +137,19 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( void *DevPtr = std::get(hBuffer->Mem).Ptr; // TODO: olMemcpy src should be const - olMemcpy(hQueue->OffloadQueue, DevPtr + offset, hQueue->OffloadDevice, - const_cast(pSrc), hQueue->Context->OffloadHost, size, - phEvent ? &EventOut : nullptr); + auto Res = + olMemcpy(hQueue->OffloadQueue, DevPtr + offset, hQueue->OffloadDevice, + const_cast(pSrc), hQueue->Context->OffloadHost, size, + phEvent ? &EventOut : nullptr); + if (Res) { + return offloadResultToUR(Res); + } if (blockingWrite) { - olWaitQueue(hQueue->OffloadQueue); + auto Res = olWaitQueue(hQueue->OffloadQueue); + if (Res) { + return offloadResultToUR(Res); + } } if (phEvent) { diff --git a/unified-runtime/source/adapters/offload/memory.cpp b/unified-runtime/source/adapters/offload/memory.cpp index bad7e40f02c9d..0ab73efbb3fd8 100644 --- a/unified-runtime/source/adapters/offload/memory.cpp +++ b/unified-runtime/source/adapters/offload/memory.cpp @@ -16,6 +16,7 @@ #include "context.hpp" #include "device.hpp" #include "memory.hpp" +#include "ur2offload.hpp" UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( ur_context_handle_t hContext, ur_mem_flags_t flags, size_t size, @@ -33,14 +34,20 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( auto AllocMode = BufferMem::AllocMode::Default; if (flags & UR_MEM_FLAG_ALLOC_HOST_POINTER) { - olMemAlloc(OffloadDevice, OL_ALLOC_TYPE_HOST, size, &HostPtr); + auto Res = olMemAlloc(OffloadDevice, OL_ALLOC_TYPE_HOST, size, &HostPtr); + if (Res) { + return offloadResultToUR(Res); + } // TODO: We (probably) need something like cuMemHostGetDevicePointer // for this to work everywhere. For now assume the managed host pointer is // device-accessible. Ptr = HostPtr; AllocMode = BufferMem::AllocMode::AllocHostPtr; } else { - olMemAlloc(OffloadDevice, OL_ALLOC_TYPE_DEVICE, size, &Ptr); + auto Res = olMemAlloc(OffloadDevice, OL_ALLOC_TYPE_DEVICE, size, &Ptr); + if (Res) { + return offloadResultToUR(Res); + } if (flags & UR_MEM_FLAG_ALLOC_COPY_HOST_POINTER) { AllocMode = BufferMem::AllocMode::CopyIn; } @@ -51,8 +58,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( hContext, ParentBuffer, flags, AllocMode, Ptr, HostPtr, size}); if (PerformInitialCopy) { - olMemcpy(nullptr, Ptr, OffloadDevice, HostPtr, hContext->OffloadHost, size, - nullptr); + auto Res = olMemcpy(nullptr, Ptr, OffloadDevice, HostPtr, + hContext->OffloadHost, size, nullptr); + if (Res) { + return offloadResultToUR(Res); + } } *phBuffer = URMemObj.release(); @@ -74,7 +84,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemRelease(ur_mem_handle_t hMem) { if (hMem->MemType == ur_mem_handle_t_::Type::Buffer) { // TODO: Handle registered host memory auto &BufferImpl = std::get(MemObjPtr->Mem); - olMemFree(BufferImpl.Ptr); + auto Res = olMemFree(BufferImpl.Ptr); + if (Res) { + return offloadResultToUR(Res); + } } return UR_RESULT_SUCCESS; From a1f7e52ca2e7d7473386e883d7981d3b2527aaf0 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 6 Jun 2025 15:47:34 +0100 Subject: [PATCH 3/4] Don't pass the host device around --- unified-runtime/source/adapters/offload/context.hpp | 3 --- unified-runtime/source/adapters/offload/enqueue.cpp | 12 +++++------- unified-runtime/source/adapters/offload/memory.cpp | 3 ++- unified-runtime/source/adapters/offload/queue.cpp | 1 - unified-runtime/source/adapters/offload/queue.hpp | 1 - 5 files changed, 7 insertions(+), 13 deletions(-) diff --git a/unified-runtime/source/adapters/offload/context.hpp b/unified-runtime/source/adapters/offload/context.hpp index 69295c0499479..38857446c47f8 100644 --- a/unified-runtime/source/adapters/offload/context.hpp +++ b/unified-runtime/source/adapters/offload/context.hpp @@ -20,12 +20,9 @@ struct ur_context_handle_t_ : RefCounted { ur_context_handle_t_(ur_device_handle_t hDevice) : Device{hDevice} { urDeviceRetain(Device); - // For convenience, store the host device in the context - OffloadHost = Adapter.HostDevice; } ~ur_context_handle_t_() { urDeviceRelease(Device); } ur_device_handle_t Device; std::unordered_map AllocTypeMap; - ol_device_handle_t OffloadHost; }; diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp index 361c3a95bc20b..649ca4810ece4 100644 --- a/unified-runtime/source/adapters/offload/enqueue.cpp +++ b/unified-runtime/source/adapters/offload/enqueue.cpp @@ -105,9 +105,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead( void *DevPtr = std::get(hBuffer->Mem).Ptr; - olMemcpy(hQueue->OffloadQueue, pDst, hQueue->Context->OffloadHost, - DevPtr + offset, hQueue->OffloadDevice, size, - phEvent ? &EventOut : nullptr); + olMemcpy(hQueue->OffloadQueue, pDst, Adapter.HostDevice, DevPtr + offset, + hQueue->OffloadDevice, size, phEvent ? &EventOut : nullptr); if (blockingRead) { olWaitQueue(hQueue->OffloadQueue); @@ -137,10 +136,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( void *DevPtr = std::get(hBuffer->Mem).Ptr; // TODO: olMemcpy src should be const - auto Res = - olMemcpy(hQueue->OffloadQueue, DevPtr + offset, hQueue->OffloadDevice, - const_cast(pSrc), hQueue->Context->OffloadHost, size, - phEvent ? &EventOut : nullptr); + auto Res = olMemcpy(hQueue->OffloadQueue, DevPtr + offset, + hQueue->OffloadDevice, const_cast(pSrc), + Adapter.HostDevice, size, phEvent ? &EventOut : nullptr); if (Res) { return offloadResultToUR(Res); } diff --git a/unified-runtime/source/adapters/offload/memory.cpp b/unified-runtime/source/adapters/offload/memory.cpp index 0ab73efbb3fd8..29a0a07a95492 100644 --- a/unified-runtime/source/adapters/offload/memory.cpp +++ b/unified-runtime/source/adapters/offload/memory.cpp @@ -13,6 +13,7 @@ #include #include +#include "adapter.hpp" #include "context.hpp" #include "device.hpp" #include "memory.hpp" @@ -59,7 +60,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( if (PerformInitialCopy) { auto Res = olMemcpy(nullptr, Ptr, OffloadDevice, HostPtr, - hContext->OffloadHost, size, nullptr); + Adapter.HostDevice, size, nullptr); if (Res) { return offloadResultToUR(Res); } diff --git a/unified-runtime/source/adapters/offload/queue.cpp b/unified-runtime/source/adapters/offload/queue.cpp index eed6dcbd63692..0e12c6206dd84 100644 --- a/unified-runtime/source/adapters/offload/queue.cpp +++ b/unified-runtime/source/adapters/offload/queue.cpp @@ -31,7 +31,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( } Queue->OffloadDevice = hDevice->OffloadDevice; - Queue->Context = hContext; *phQueue = Queue; diff --git a/unified-runtime/source/adapters/offload/queue.hpp b/unified-runtime/source/adapters/offload/queue.hpp index 20812b3cf6f4f..6afe4bf15098e 100644 --- a/unified-runtime/source/adapters/offload/queue.hpp +++ b/unified-runtime/source/adapters/offload/queue.hpp @@ -18,5 +18,4 @@ struct ur_queue_handle_t_ : RefCounted { ol_queue_handle_t OffloadQueue; ol_device_handle_t OffloadDevice; - ur_context_handle_t Context; }; From 0fdfba2a6a56cebc1c4b483af5d8df9b25c59b9b Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Fri, 6 Jun 2025 16:45:34 +0100 Subject: [PATCH 4/4] Remove const_cast --- unified-runtime/source/adapters/offload/enqueue.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp index 649ca4810ece4..5dc1e931bca9f 100644 --- a/unified-runtime/source/adapters/offload/enqueue.cpp +++ b/unified-runtime/source/adapters/offload/enqueue.cpp @@ -135,10 +135,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite( void *DevPtr = std::get(hBuffer->Mem).Ptr; - // TODO: olMemcpy src should be const - auto Res = olMemcpy(hQueue->OffloadQueue, DevPtr + offset, - hQueue->OffloadDevice, const_cast(pSrc), - Adapter.HostDevice, size, phEvent ? &EventOut : nullptr); + auto Res = + olMemcpy(hQueue->OffloadQueue, DevPtr + offset, hQueue->OffloadDevice, + pSrc, Adapter.HostDevice, size, phEvent ? &EventOut : nullptr); if (Res) { return offloadResultToUR(Res); }