Conversation

@RossBrunton
Contributor

To get around some memory issues with aliasing pointers, device
allocations need to be linked with their allocating device. olMemFree
now requires a device handle to be provided for device allocations.

@llvmbot
Member

llvmbot commented Aug 21, 2025

@llvm/pr-subscribers-offload

Author: Ross Brunton (RossBrunton)

Changes

To get around some memory issues with aliasing pointers, device
allocations need to be linked with their allocating device. olMemFree
now requires a device handle to be provided for device allocations.


Full diff: https://github.com/llvm/llvm-project/pull/154733.diff

7 Files Affected:

  • (modified) offload/liboffload/API/Memory.td (+15-4)
  • (modified) offload/liboffload/src/OffloadImpl.cpp (+26-18)
  • (modified) offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp (+8-8)
  • (modified) offload/unittests/OffloadAPI/memory/olMemAlloc.cpp (+3-3)
  • (modified) offload/unittests/OffloadAPI/memory/olMemFree.cpp (+39-5)
  • (modified) offload/unittests/OffloadAPI/memory/olMemcpy.cpp (+10-10)
  • (modified) offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp (+1-1)
diff --git a/offload/liboffload/API/Memory.td b/offload/liboffload/API/Memory.td
index 5f7158588bc77..a904364075ba8 100644
--- a/offload/liboffload/API/Memory.td
+++ b/offload/liboffload/API/Memory.td
@@ -14,15 +14,18 @@ def : Enum {
   let name = "ol_alloc_type_t";
   let desc = "Represents the type of allocation made with olMemAlloc.";
   let etors = [
-    Etor<"HOST", "Host allocation">,
-    Etor<"DEVICE", "Device allocation">,
-    Etor<"MANAGED", "Managed allocation">
+    Etor<"HOST", "Host allocation. Allocated on the host and visible to the host and all devices sharing the same platform.">,
+    Etor<"DEVICE", "Device allocation. Allocated on a specific device and visible only to that device.">,
+    Etor<"MANAGED", "Managed allocation. Allocated on a specific device and visible to the host and all devices sharing the same platform.">
   ];
 }
 
 def : Function {
   let name = "olMemAlloc";
   let desc = "Creates a memory allocation on the specified device.";
+  let details = [
+      "`DEVICE` allocations do not share the same address space as the host or other devices. The `AllocationOut` pointer cannot be used to uniquely identify the allocation in these cases.",
+  ];
   let params = [
     Param<"ol_device_handle_t", "Device", "handle of the device to allocate on", PARAM_IN>,
     Param<"ol_alloc_type_t", "Type", "type of the allocation", PARAM_IN>,
@@ -39,10 +42,18 @@ def : Function {
 def : Function {
   let name = "olMemFree";
   let desc = "Frees a memory allocation previously made by olMemAlloc.";
+  let details = [
+      "`Address` must be the beginning of the allocation.",
+      "`Device` must be provided for memory allocated as `OL_ALLOC_TYPE_DEVICE`, and may be provided for other types.",
+      "If `Device` is provided, it must match the device used to allocate the memory with `olMemAlloc`.",
+  ];
   let params = [
+    Param<"ol_device_handle_t", "Device", "handle of the device this allocation was allocated on", PARAM_IN_OPTIONAL>,
     Param<"void*", "Address", "address of the allocation to free", PARAM_IN>,
   ];
-  let returns = [];
+  let returns = [
+    Return<"OL_ERRC_NOT_FOUND", ["The address was not found in the list of allocations"]>
+  ];
 }
 
 def : Function {
diff --git a/offload/liboffload/src/OffloadImpl.cpp b/offload/liboffload/src/OffloadImpl.cpp
index 49154337dd193..da48378547974 100644
--- a/offload/liboffload/src/OffloadImpl.cpp
+++ b/offload/liboffload/src/OffloadImpl.cpp
@@ -106,6 +106,7 @@ namespace llvm {
 namespace offload {
 
 struct AllocInfo {
+  void *Base;
   ol_device_handle_t Device;
   ol_alloc_type_t Type;
 };
@@ -124,8 +125,8 @@ struct OffloadContext {
 
   bool TracingEnabled = false;
   bool ValidationEnabled = true;
-  DenseMap<void *, AllocInfo> AllocInfoMap{};
-  std::mutex AllocInfoMapMutex{};
+  SmallVector<AllocInfo> AllocInfoList{};
+  std::mutex AllocInfoListMutex{};
   SmallVector<ol_platform_impl_t, 4> Platforms{};
   size_t RefCount;
 
@@ -535,30 +536,37 @@ Error olMemAlloc_impl(ol_device_handle_t Device, ol_alloc_type_t Type,
 
   *AllocationOut = *Alloc;
   {
-    std::lock_guard<std::mutex> Lock(OffloadContext::get().AllocInfoMapMutex);
-    OffloadContext::get().AllocInfoMap.insert_or_assign(
-        *Alloc, AllocInfo{Device, Type});
+    std::lock_guard<std::mutex> Lock(OffloadContext::get().AllocInfoListMutex);
+    OffloadContext::get().AllocInfoList.emplace_back(
+        AllocInfo{*AllocationOut, Device, Type});
   }
   return Error::success();
 }
 
-Error olMemFree_impl(void *Address) {
-  ol_device_handle_t Device;
-  ol_alloc_type_t Type;
+Error olMemFree_impl(ol_device_handle_t Device, void *Address) {
+  AllocInfo Removed;
   {
-    std::lock_guard<std::mutex> Lock(OffloadContext::get().AllocInfoMapMutex);
-    if (!OffloadContext::get().AllocInfoMap.contains(Address))
-      return createOffloadError(ErrorCode::INVALID_ARGUMENT,
-                                "address is not a known allocation");
+    std::lock_guard<std::mutex> Lock(OffloadContext::get().AllocInfoListMutex);
+
+    auto &List = OffloadContext::get().AllocInfoList;
+    auto Entry = std::find_if(List.begin(), List.end(), [&](AllocInfo &Entry) {
+      return Address == Entry.Base && (!Device || Entry.Device == Device);
+    });
+
+    if (Entry == List.end())
+      return Plugin::error(ErrorCode::NOT_FOUND,
+                           "could not find memory allocated by olMemAlloc");
+    if (!Device && Entry->Type == OL_ALLOC_TYPE_DEVICE)
+      return Plugin::error(
+          ErrorCode::NOT_FOUND,
+          "specifying the Device parameter is required to query device memory");
 
-    auto AllocInfo = OffloadContext::get().AllocInfoMap.at(Address);
-    Device = AllocInfo.Device;
-    Type = AllocInfo.Type;
-    OffloadContext::get().AllocInfoMap.erase(Address);
+    Removed = std::move(*Entry);
+    *Entry = List.pop_back_val();
   }
 
-  if (auto Res =
-          Device->Device->dataDelete(Address, convertOlToPluginAllocTy(Type)))
+  if (auto Res = Removed.Device->Device->dataDelete(
+          Removed.Base, convertOlToPluginAllocTy(Removed.Type)))
     return Res;
 
   return Error::success();
diff --git a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
index 1dac8c50271b5..a7f4881bcc709 100644
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -101,7 +101,7 @@ TEST_P(olLaunchKernelFooTest, Success) {
     ASSERT_EQ(Data[i], i);
   }
 
-  ASSERT_SUCCESS(olMemFree(Mem));
+  ASSERT_SUCCESS(olMemFree(Device, Mem));
 }
 
 TEST_P(olLaunchKernelFooTest, SuccessThreaded) {
@@ -123,7 +123,7 @@ TEST_P(olLaunchKernelFooTest, SuccessThreaded) {
       ASSERT_EQ(Data[i], i);
     }
 
-    ASSERT_SUCCESS(olMemFree(Mem));
+    ASSERT_SUCCESS(olMemFree(Device, Mem));
   });
 }
 
@@ -151,7 +151,7 @@ TEST_P(olLaunchKernelFooTest, SuccessSynchronous) {
     ASSERT_EQ(Data[i], i);
   }
 
-  ASSERT_SUCCESS(olMemFree(Mem));
+  ASSERT_SUCCESS(olMemFree(Device, Mem));
 }
 
 TEST_P(olLaunchKernelLocalMemTest, Success) {
@@ -176,7 +176,7 @@ TEST_P(olLaunchKernelLocalMemTest, Success) {
   for (uint32_t i = 0; i < LaunchArgs.GroupSize.x * LaunchArgs.NumGroups.x; i++)
     ASSERT_EQ(Data[i], (i % 64) * 2);
 
-  ASSERT_SUCCESS(olMemFree(Mem));
+  ASSERT_SUCCESS(olMemFree(Device, Mem));
 }
 
 TEST_P(olLaunchKernelLocalMemReductionTest, Success) {
@@ -199,7 +199,7 @@ TEST_P(olLaunchKernelLocalMemReductionTest, Success) {
   for (uint32_t i = 0; i < LaunchArgs.NumGroups.x; i++)
     ASSERT_EQ(Data[i], 2 * LaunchArgs.GroupSize.x);
 
-  ASSERT_SUCCESS(olMemFree(Mem));
+  ASSERT_SUCCESS(olMemFree(Device, Mem));
 }
 
 TEST_P(olLaunchKernelLocalMemStaticTest, Success) {
@@ -222,7 +222,7 @@ TEST_P(olLaunchKernelLocalMemStaticTest, Success) {
   for (uint32_t i = 0; i < LaunchArgs.NumGroups.x; i++)
     ASSERT_EQ(Data[i], 2 * LaunchArgs.GroupSize.x);
 
-  ASSERT_SUCCESS(olMemFree(Mem));
+  ASSERT_SUCCESS(olMemFree(Device, Mem));
 }
 
 TEST_P(olLaunchKernelGlobalTest, Success) {
@@ -245,7 +245,7 @@ TEST_P(olLaunchKernelGlobalTest, Success) {
     ASSERT_EQ(Data[i], i * 2);
   }
 
-  ASSERT_SUCCESS(olMemFree(Mem));
+  ASSERT_SUCCESS(olMemFree(Device, Mem));
 }
 
 TEST_P(olLaunchKernelGlobalTest, InvalidNotAKernel) {
@@ -273,7 +273,7 @@ TEST_P(olLaunchKernelGlobalCtorTest, Success) {
     ASSERT_EQ(Data[i], i + 100);
   }
 
-  ASSERT_SUCCESS(olMemFree(Mem));
+  ASSERT_SUCCESS(olMemFree(Device, Mem));
 }
 
 TEST_P(olLaunchKernelGlobalDtorTest, Success) {
diff --git a/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp b/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp
index 00e428ec2abc7..c1d585d7271f3 100644
--- a/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp
+++ b/offload/unittests/OffloadAPI/memory/olMemAlloc.cpp
@@ -17,21 +17,21 @@ TEST_P(olMemAllocTest, SuccessAllocManaged) {
   void *Alloc = nullptr;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, 1024, &Alloc));
   ASSERT_NE(Alloc, nullptr);
-  olMemFree(Alloc);
+  olMemFree(Device, Alloc);
 }
 
 TEST_P(olMemAllocTest, SuccessAllocHost) {
   void *Alloc = nullptr;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_HOST, 1024, &Alloc));
   ASSERT_NE(Alloc, nullptr);
-  olMemFree(Alloc);
+  olMemFree(Device, Alloc);
 }
 
 TEST_P(olMemAllocTest, SuccessAllocDevice) {
   void *Alloc = nullptr;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, &Alloc));
   ASSERT_NE(Alloc, nullptr);
-  olMemFree(Alloc);
+  olMemFree(Device, Alloc);
 }
 
 TEST_P(olMemAllocTest, InvalidNullDevice) {
diff --git a/offload/unittests/OffloadAPI/memory/olMemFree.cpp b/offload/unittests/OffloadAPI/memory/olMemFree.cpp
index dfaf9bdef3189..8618e740f02bd 100644
--- a/offload/unittests/OffloadAPI/memory/olMemFree.cpp
+++ b/offload/unittests/OffloadAPI/memory/olMemFree.cpp
@@ -16,24 +16,58 @@ OFFLOAD_TESTS_INSTANTIATE_DEVICE_FIXTURE(olMemFreeTest);
 TEST_P(olMemFreeTest, SuccessFreeManaged) {
   void *Alloc = nullptr;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, 1024, &Alloc));
-  ASSERT_SUCCESS(olMemFree(Alloc));
+  ASSERT_SUCCESS(olMemFree(Device, Alloc));
+}
+
+TEST_P(olMemFreeTest, SuccessFreeManagedNull) {
+  void *Alloc = nullptr;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, 1024, &Alloc));
+  ASSERT_SUCCESS(olMemFree(nullptr, Alloc));
 }
 
 TEST_P(olMemFreeTest, SuccessFreeHost) {
   void *Alloc = nullptr;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_HOST, 1024, &Alloc));
-  ASSERT_SUCCESS(olMemFree(Alloc));
+  ASSERT_SUCCESS(olMemFree(Device, Alloc));
+}
+
+TEST_P(olMemFreeTest, SuccessFreeHostNull) {
+  void *Alloc = nullptr;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_HOST, 1024, &Alloc));
+  ASSERT_SUCCESS(olMemFree(nullptr, Alloc));
 }
 
 TEST_P(olMemFreeTest, SuccessFreeDevice) {
   void *Alloc = nullptr;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, &Alloc));
-  ASSERT_SUCCESS(olMemFree(Alloc));
+  ASSERT_SUCCESS(olMemFree(Device, Alloc));
 }
 
 TEST_P(olMemFreeTest, InvalidNullPtr) {
+  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER, olMemFree(Device, nullptr));
+}
+
+TEST_P(olMemFreeTest, InvalidFreeDeviceNull) {
+  void *Alloc = nullptr;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, &Alloc));
+  ASSERT_ERROR(OL_ERRC_NOT_FOUND, olMemFree(nullptr, Alloc));
+}
+
+TEST_P(olMemFreeTest, InvalidFreeManagedWrongDevice) {
+  void *Alloc = nullptr;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED, 1024, &Alloc));
+  ASSERT_ERROR(OL_ERRC_NOT_FOUND, olMemFree(TestEnvironment::getHostDevice(), Alloc));
+}
+
+TEST_P(olMemFreeTest, InvalidFreeHostWrongDevice) {
+  void *Alloc = nullptr;
+  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_HOST, 1024, &Alloc));
+  ASSERT_ERROR(OL_ERRC_NOT_FOUND, olMemFree(TestEnvironment::getHostDevice(), Alloc));
+}
+
+
+TEST_P(olMemFreeTest, InvalidFreeDeviceWrongDevice) {
   void *Alloc = nullptr;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_DEVICE, 1024, &Alloc));
-  ASSERT_ERROR(OL_ERRC_INVALID_NULL_POINTER, olMemFree(nullptr));
-  ASSERT_SUCCESS(olMemFree(Alloc));
+  ASSERT_ERROR(OL_ERRC_NOT_FOUND, olMemFree(TestEnvironment::getHostDevice(), Alloc));
 }
diff --git a/offload/unittests/OffloadAPI/memory/olMemcpy.cpp b/offload/unittests/OffloadAPI/memory/olMemcpy.cpp
index cc67d782ef403..d028099916848 100644
--- a/offload/unittests/OffloadAPI/memory/olMemcpy.cpp
+++ b/offload/unittests/OffloadAPI/memory/olMemcpy.cpp
@@ -46,7 +46,7 @@ TEST_P(olMemcpyTest, SuccessHtoD) {
   std::vector<uint8_t> Input(Size, 42);
   ASSERT_SUCCESS(olMemcpy(Queue, Alloc, Device, Input.data(), Host, Size));
   olSyncQueue(Queue);
-  olMemFree(Alloc);
+  olMemFree(Device, Alloc);
 }
 
 TEST_P(olMemcpyTest, SuccessDtoH) {
@@ -62,7 +62,7 @@ TEST_P(olMemcpyTest, SuccessDtoH) {
   for (uint8_t Val : Output) {
     ASSERT_EQ(Val, 42);
   }
-  ASSERT_SUCCESS(olMemFree(Alloc));
+  ASSERT_SUCCESS(olMemFree(Device, Alloc));
 }
 
 TEST_P(olMemcpyTest, SuccessDtoD) {
@@ -81,8 +81,8 @@ TEST_P(olMemcpyTest, SuccessDtoD) {
   for (uint8_t Val : Output) {
     ASSERT_EQ(Val, 42);
   }
-  ASSERT_SUCCESS(olMemFree(AllocA));
-  ASSERT_SUCCESS(olMemFree(AllocB));
+  ASSERT_SUCCESS(olMemFree(Device, AllocA));
+  ASSERT_SUCCESS(olMemFree(Device, AllocB));
 }
 
 TEST_P(olMemcpyTest, SuccessHtoHSync) {
@@ -110,7 +110,7 @@ TEST_P(olMemcpyTest, SuccessDtoHSync) {
   for (uint8_t Val : Output) {
     ASSERT_EQ(Val, 42);
   }
-  ASSERT_SUCCESS(olMemFree(Alloc));
+  ASSERT_SUCCESS(olMemFree(Device, Alloc));
 }
 
 TEST_P(olMemcpyTest, SuccessSizeZero) {
@@ -146,8 +146,8 @@ TEST_P(olMemcpyGlobalTest, SuccessRoundTrip) {
   for (uint32_t I = 0; I < 64; I++)
     ASSERT_EQ(DestData[I], I);
 
-  ASSERT_SUCCESS(olMemFree(DestMem));
-  ASSERT_SUCCESS(olMemFree(SourceMem));
+  ASSERT_SUCCESS(olMemFree(Device, DestMem));
+  ASSERT_SUCCESS(olMemFree(Device, SourceMem));
 }
 
 TEST_P(olMemcpyGlobalTest, SuccessWrite) {
@@ -178,8 +178,8 @@ TEST_P(olMemcpyGlobalTest, SuccessWrite) {
   for (uint32_t I = 0; I < 64; I++)
     ASSERT_EQ(DestData[I], I);
 
-  ASSERT_SUCCESS(olMemFree(DestMem));
-  ASSERT_SUCCESS(olMemFree(SourceMem));
+  ASSERT_SUCCESS(olMemFree(Device, DestMem));
+  ASSERT_SUCCESS(olMemFree(Device, SourceMem));
 }
 
 TEST_P(olMemcpyGlobalTest, SuccessRead) {
@@ -199,5 +199,5 @@ TEST_P(olMemcpyGlobalTest, SuccessRead) {
   for (uint32_t I = 0; I < 64; I++)
     ASSERT_EQ(DestData[I], I * 2);
 
-  ASSERT_SUCCESS(olMemFree(DestMem));
+  ASSERT_SUCCESS(olMemFree(Device, DestMem));
 }
diff --git a/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp b/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp
index aa86750f6adf9..2f3fda6fb729b 100644
--- a/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp
+++ b/offload/unittests/OffloadAPI/queue/olLaunchHostFunction.cpp
@@ -93,7 +93,7 @@ TEST_P(olLaunchHostFunctionKernelTest, SuccessBlocking) {
   }
 
   ASSERT_SUCCESS(olDestroyQueue(Queue));
-  ASSERT_SUCCESS(olMemFree(Mem));
+  ASSERT_SUCCESS(olMemFree(Device, Mem));
 }
 
 TEST_P(olLaunchHostFunctionTest, InvalidNullCallback) {

Contributor Author
I considered having this be a Map<void *, SmallVector<AllocInfo>>; however, I think it's better if functions like olMemGetInfo accept any address inside the allocation instead of just the start. This makes that logic much less clunky.

Contributor
however I think it's better if functions like olMemGetInfo accept any address inside the allocation instead of just the start.

Why? I don't think we have such use case in SYCL. Does OpenMP?
This decision makes the implementation more complicated, and I'm not sure if there's much value.

Contributor Author
Currently, the implementation of urEnqueueUSMMemcpy contains this:

auto GetDevice = [&](const void *Ptr) {
  auto Res = hQueue->UrContext->getAllocType(Ptr);
  if (!Res)
    return Adapter->HostDevice;
  return Res->Type == OL_ALLOC_TYPE_HOST ? Adapter->HostDevice
                                         : hQueue->OffloadDevice;
};

return doMemcpy(UR_COMMAND_USM_MEMCPY, hQueue, pDst, GetDevice(pDst), pSrc,
                GetDevice(pSrc), size, blocking, numEventsInWaitList,
                phEventWaitList, phEvent);

We need to look up whether the allocation is from the host or the device inside liboffload. As far as I know, no UR function working on USM pointers requires the pointer to be at the start of an allocation, so we'd need to be able to look up the allocation type from anywhere in the buffer.

Contributor
What's the reason olMemcpy needs to take destination/source devices? L0/UR/OpenCL/CUDA don't. The underlying APIs all seem to support inferring the device types implicitly.

Contributor Author
UR does require the destination and source device: it needs to know which backend (AMD/CUDA/OpenCL/etc.) to dispatch to, which is stored in ur_queue_handle_t. OpenCL also requires the cl_intel_unified_shared_memory extension to query the source and destination device.

Since the queue is optional in olMemcpy, a device to look up the backend (AMD or Nvidia) is required. And if we need one device, we may as well have the user specify both instead of requiring the backend to support determining whether the copy is h2d/h2h/d2d. It also allows the API to be extended in the future to allow copying data between two platforms if needed.

I wasn't involved in the initial design for this, so @jhuber6 and @callumfare might be able to shed some more light on why it is how it is.

Contributor

@pbalcer commented Sep 1, 2025
UR does require the destination and source device - It needs to know which backend (AMD/CUDA/OpenCL/etc) to dispatch to, which is stored in ur_queue_handle_t.

Not sure I follow, here's the UR entry point:

UR_APIEXPORT ur_result_t UR_APICALL urEnqueueUSMMemcpy(
    ur_queue_handle_t hQueue,
    bool blocking,
    void *pDst,
    const void *pSrc,
    size_t size,
    uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList,
    ur_event_handle_t *phEvent);

Yes, we need a hQueue to do the dispatch, but it's there because memcpy is logically an operation on the queue.

Since the queue is optional in olMemcpy, a device to lookup the backend (AMD or Nvidia) is required. And if we need one device, we may as well have the user specify both instead of requiring the backend to support determining whether the copy is h2d/h2h/d2d.

I think we should revisit the decision for the queue to be optional. Given the current API, the best we can probably do is something like an interval map lookup (twice, once each for src and dst), which is going to be both complex and expensive. Do we really want to pay the cost of doing those lookups for every USM memory operation?

In UR, we have a separate non-queued blocking copy that operates on a context:

UR_APIEXPORT ur_result_t UR_APICALL urUSMContextMemcpyExp(
    /// [in] Context associated with the device(s) that own the allocations
    /// `pSrc` and `pDst`.
    ur_context_handle_t hContext,
    void *pDst,
    const void *pSrc,
    size_t size);

We could do something similar instead of trying to overload a single function with multiple different functionalities.

@github-actions
github-actions bot commented Aug 21, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

Contributor
I'm concerned about a linear search over all allocations during every free. I don't really understand the argument against using a map. You are doing an exact lookup here.

In GetMemInfo, if you really need to be able to find this for any arbitrary address, you can search the map for the last key less than or equal to the address and check it against the allocation size.

Contributor Author
I originally considered using std::pair<ol_device_handle_t, AllocInfo> as the key, or making the value a SmallVector<AllocInfo> containing the device. However, that caused the implementation to spiral out in complexity and become a mess.

I'm not sure there'd be enough allocations to create a noticeable performance difference.

Contributor
It makes liboffload's free (and, based on your other comment, any function that accepts a USM pointer) O(N), where N is the number of allocations. Unless we can reasonably say that N is going to be very small, and I'm not sure we can, because e.g. SYCL doesn't restrict users from arbitrarily allocating memory, there will be a noticeable performance difference for at least some applications.

Contributor
Is it necessary to have the allocations of all devices in the same structure? I couldn't find a use for that requirement in the current code.

If it's not necessary, you could have a map in each device object with its own allocations, keeping a map to speed up lookups and improving concurrency between operations on distinct devices.

Contributor Author
In theory it's possible to allocate host memory which isn't tied to a specific device, although we don't expose that functionality in any liboffload entry points yet.

However, what we do allow is looking up host and managed allocations without providing a device. If each device managed its own allocations, doing that would require iterating over (and locking) every device and searching its allocation list.

Contributor
Thanks for the clarification. I would use a map here instead of the vector, as @pbalcer said.

If you need lookups using non-base pointers (e.g., for olMemGetInfo), you could use std::map with base pointer + size as the key, and use the lower_bound/upper_bound functions.

Contributor Author
I'm not sure it's possible to use a binary search to scan through allocations since memory regions can overlap in weird ways that make it hard to establish an order.

@RossBrunton
Contributor Author

Superseded by #157478
