[Offload] Remove handling for device memory pool #163629

jhuber6 · 2025-10-15T21:11:35Z

Summary:
This was a lot of code that was only used for upstream LLVM builds of
AMDGPU offloading. We have a generic and fast malloc in libc now so
just use that. Simplifies code, can be added back if we start providing
alternate forms but I don't think there's a single use-case that would
justify it yet.

llvmbot · 2025-10-15T21:12:12Z

@llvm/pr-subscribers-backend-amdgpu

Author: Joseph Huber (jhuber6)

Changes

Summary:
This was a lot of code that was only used for upstream LLVM builds of
AMDGPU offloading. We have a generic and fast malloc in libc now so
just use that. Simplifies code, can be added back if we start providing
alternate forms but I don't think there's a single use-case that would
justify it yet.

Patch is 22.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/163629.diff

15 Files Affected:

(modified) offload/include/Shared/Environment.h (-22)
(modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (-14)
(modified) offload/plugins-nextgen/common/include/PluginInterface.h (+9-15)
(modified) offload/plugins-nextgen/common/src/PluginInterface.cpp (+1-95)
(modified) offload/plugins-nextgen/cuda/src/rtl.cpp (-5)
(modified) offload/plugins-nextgen/host/src/rtl.cpp (-8)
(renamed) offload/test/libc/malloc_parallel.c ()
(modified) offload/test/mapping/lambda_mapping.cpp (+2)
(removed) offload/test/offloading/malloc.c (-37)
(modified) openmp/device/include/Allocator.h (-6)
(modified) openmp/device/src/Allocator.cpp (+34-33)
(modified) openmp/device/src/Kernel.cpp (-1)
(modified) openmp/device/src/Misc.cpp (+2-2)
(modified) openmp/device/src/State.cpp (+2-22)
(modified) openmp/docs/design/Runtimes.rst (-1)

diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h
index 2a283bd6fa4ed..79e45fd8e082d 100644
--- a/offload/include/Shared/Environment.h
+++ b/offload/include/Shared/Environment.h
@@ -21,7 +21,6 @@ enum class DeviceDebugKind : uint32_t {
   Assertion = 1U << 0,
   FunctionTracing = 1U << 1,
   CommonIssues = 1U << 2,
-  AllocationTracker = 1U << 3,
   PGODump = 1U << 4,
 };
 
@@ -36,27 +35,6 @@ struct DeviceEnvironmentTy {
   uint64_t HardwareParallelism;
 };
 
-struct DeviceMemoryPoolTy {
-  void *Ptr;
-  uint64_t Size;
-};
-
-struct DeviceMemoryPoolTrackingTy {
-  uint64_t NumAllocations;
-  uint64_t AllocationTotal;
-  uint64_t AllocationMin;
-  uint64_t AllocationMax;
-
-  void combine(DeviceMemoryPoolTrackingTy &Other) {
-    NumAllocations += Other.NumAllocations;
-    AllocationTotal += Other.AllocationTotal;
-    AllocationMin = AllocationMin > Other.AllocationMin ? Other.AllocationMin
-                                                        : AllocationMin;
-    AllocationMax = AllocationMax < Other.AllocationMax ? Other.AllocationMax
-                                                        : AllocationMax;
-  }
-};
-
 // NOTE: Please don't change the order of those members as their indices are
 // used in the middle end. Always add the new data member at the end.
 // Different from KernelEnvironmentTy below, this structure contains members
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index a7723b8598815..4d827b3ad31e2 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3057,17 +3057,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     StackSize = Value;
     return Plugin::success();
   }
-  Error getDeviceHeapSize(uint64_t &Value) override {
-    Value = DeviceMemoryPoolSize;
-    return Plugin::success();
-  }
-  Error setDeviceHeapSize(uint64_t Value) override {
-    for (DeviceImageTy *Image : LoadedImages)
-      if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value))
-        return Err;
-    DeviceMemoryPoolSize = Value;
-    return Plugin::success();
-  }
   Error getDeviceMemorySize(uint64_t &Value) override {
     for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
       if (Pool->isGlobal()) {
@@ -3269,9 +3258,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// Reference to the host device.
   AMDHostDeviceTy &HostDevice;
 
-  /// The current size of the global device memory pool (managed by us).
-  uint64_t DeviceMemoryPoolSize = 1L << 29L /*512MB=*/;
-
   /// The current size of the stack that will be used in cases where it could
   /// not be statically determined.
   uint64_t StackSize = 16 * 1024 /* 16 KB */;
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 8c530bba3882c..26723d94199e7 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -816,10 +816,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   Error unloadBinary(DeviceImageTy *Image);
   virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0;
 
-  /// Setup the global device memory pool, if the plugin requires one.
-  Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
-                              uint64_t PoolSize);
-
   // Setup the RPC server for this device if needed. This may not run on some
   // plugins like the CPU targets. By default, it will not be executed so it is
   // up to the target to override this using the shouldSetupRPCServer function.
@@ -1061,6 +1057,15 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 
   virtual Error getDeviceStackSize(uint64_t &V) = 0;
 
+  virtual Error getDeviceHeapSize(uint64_t &V) {
+    return Plugin::error(error::ErrorCode::UNSUPPORTED,
+                         "%s not supported by platform", __func__);
+  }
+  virtual Error setDeviceHeapSize(uint64_t V) {
+    return Plugin::error(error::ErrorCode::UNSUPPORTED,
+                         "%s not supported by platform", __func__);
+  }
+
   /// Returns true if current plugin architecture is an APU
   /// and unified_shared_memory was not requested by the program.
   bool useAutoZeroCopy();
@@ -1149,12 +1154,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// plugin can implement the setters as no-op and setting the output
   /// value to zero for the getters.
   virtual Error setDeviceStackSize(uint64_t V) = 0;
-  virtual Error getDeviceHeapSize(uint64_t &V) = 0;
-  virtual Error setDeviceHeapSize(uint64_t V) = 0;
-
-  /// Indicate whether the device should setup the global device memory pool. If
-  /// false is return the value on the device will be uninitialized.
-  virtual bool shouldSetupDeviceMemoryPool() const { return true; }
 
   /// Indicate whether or not the device should setup the RPC server. This is
   /// only necessary for unhosted targets like the GPU.
@@ -1175,7 +1174,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   Int32Envar OMPX_DebugKind;
   UInt32Envar OMPX_SharedMemorySize;
   UInt64Envar OMPX_TargetStackSize;
-  UInt64Envar OMPX_TargetHeapSize;
 
   /// Environment flag to set the minimum number of threads we use for a
   /// low-trip count combined loop. Instead of using more threads we increase
@@ -1229,10 +1227,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// Internal representation for OMPT device (initialize & finalize)
   std::atomic<bool> OmptInitialized;
 #endif
-
-private:
-  DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
-  DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
 };
 
 /// Class implementing common functionalities of offload plugins. Each plugin
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index db43cbe49cc2b..b0a4113c2c3ec 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -708,7 +708,7 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
       // Do not initialize the following two envars since they depend on the
       // device initialization. These cannot be consulted until the device is
       // initialized correctly. We initialize them in GenericDeviceTy::init().
-      OMPX_TargetStackSize(), OMPX_TargetHeapSize(),
+      OMPX_TargetStackSize(),
       // By default, the initial number of streams and events is 1.
       OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", 1),
       OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1),
@@ -758,14 +758,6 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
     return StackSizeEnvarOrErr.takeError();
   OMPX_TargetStackSize = std::move(*StackSizeEnvarOrErr);
 
-  auto HeapSizeEnvarOrErr = UInt64Envar::create(
-      "LIBOMPTARGET_HEAP_SIZE",
-      [this](uint64_t &V) -> Error { return getDeviceHeapSize(V); },
-      [this](uint64_t V) -> Error { return setDeviceHeapSize(V); });
-  if (!HeapSizeEnvarOrErr)
-    return HeapSizeEnvarOrErr.takeError();
-  OMPX_TargetHeapSize = std::move(*HeapSizeEnvarOrErr);
-
   // Update the maximum number of teams and threads after the device
   // initialization sets the corresponding hardware limit.
   if (OMP_NumTeams > 0)
@@ -791,19 +783,6 @@ Error GenericDeviceTy::unloadBinary(DeviceImageTy *Image) {
   if (auto Err = callGlobalDestructors(Plugin, *Image))
     return Err;
 
-  if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
-    GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
-    DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
-    GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
-                           sizeof(DeviceMemoryPoolTrackingTy),
-                           &ImageDeviceMemoryPoolTracking);
-    if (auto Err =
-            GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
-      consumeError(std::move(Err));
-    }
-    DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
-  }
-
   GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
   auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
   if (!ProfOrErr)
@@ -829,22 +808,6 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
       return Err;
   LoadedImages.clear();
 
-  if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
-    // TODO: Write this by default into a file.
-    printf("\n\n|-----------------------\n"
-           "| Device memory tracker:\n"
-           "|-----------------------\n"
-           "| #Allocations: %lu\n"
-           "| Byes allocated: %lu\n"
-           "| Minimal allocation: %lu\n"
-           "| Maximal allocation: %lu\n"
-           "|-----------------------\n\n\n",
-           DeviceMemoryPoolTracking.NumAllocations,
-           DeviceMemoryPoolTracking.AllocationTotal,
-           DeviceMemoryPoolTracking.AllocationMin,
-           DeviceMemoryPoolTracking.AllocationMax);
-  }
-
   // Delete the memory manager before deinitializing the device. Otherwise,
   // we may delete device allocations after the device is deinitialized.
   if (MemoryManager)
@@ -897,18 +860,6 @@ Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
   // Add the image to list.
   LoadedImages.push_back(Image);
 
-  // Setup the global device memory pool if needed.
-  if (!Plugin.getRecordReplay().isReplaying() &&
-      shouldSetupDeviceMemoryPool()) {
-    uint64_t HeapSize;
-    auto SizeOrErr = getDeviceHeapSize(HeapSize);
-    if (SizeOrErr) {
-      REPORT("No global device memory pool due to error: %s\n",
-             toString(std::move(SizeOrErr)).data());
-    } else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize))
-      return std::move(Err);
-  }
-
   if (auto Err = setupRPCServer(Plugin, *Image))
     return std::move(Err);
 
@@ -932,51 +883,6 @@ Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
   return Image;
 }
 
-Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
-                                             DeviceImageTy &Image,
-                                             uint64_t PoolSize) {
-  // Free the old pool, if any.
-  if (DeviceMemoryPool.Ptr) {
-    if (auto Err = dataDelete(DeviceMemoryPool.Ptr,
-                              TargetAllocTy::TARGET_ALLOC_DEVICE))
-      return Err;
-  }
-
-  DeviceMemoryPool.Size = PoolSize;
-  auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr,
-                              TargetAllocTy::TARGET_ALLOC_DEVICE);
-  if (AllocOrErr) {
-    DeviceMemoryPool.Ptr = *AllocOrErr;
-  } else {
-    auto Err = AllocOrErr.takeError();
-    REPORT("Failure to allocate device memory for global memory pool: %s\n",
-           toString(std::move(Err)).data());
-    DeviceMemoryPool.Ptr = nullptr;
-    DeviceMemoryPool.Size = 0;
-  }
-
-  // Create the metainfo of the device environment global.
-  GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
-  if (!GHandler.isSymbolInImage(*this, Image,
-                                "__omp_rtl_device_memory_pool_tracker")) {
-    DP("Skip the memory pool as there is no tracker symbol in the image.");
-    return Error::success();
-  }
-
-  GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
-                         sizeof(DeviceMemoryPoolTrackingTy),
-                         &DeviceMemoryPoolTracking);
-  if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
-    return Err;
-
-  // Create the metainfo of the device environment global.
-  GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool",
-                        sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool);
-
-  // Write device environment values to the device.
-  return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal);
-}
-
 Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
                                       DeviceImageTy &Image) {
   // The plugin either does not need an RPC server or it is unavailable.
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index db94f7f2dd995..3ef724328916a 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1228,11 +1228,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Info;
   }
 
-  virtual bool shouldSetupDeviceMemoryPool() const override {
-    /// We use the CUDA malloc for now.
-    return false;
-  }
-
   /// Getters and setters for stack and heap sizes.
   Error getDeviceStackSize(uint64_t &Value) override {
     return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index eb4ecac9907a1..48de1fefa29d6 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -380,9 +380,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
     return Info;
   }
 
-  /// This plugin should not setup the device environment or memory pool.
-  virtual bool shouldSetupDeviceMemoryPool() const override { return false; };
-
   /// Getters and setters for stack size and heap size not relevant.
   Error getDeviceStackSize(uint64_t &Value) override {
     Value = 0;
@@ -391,11 +388,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   Error setDeviceStackSize(uint64_t Value) override {
     return Plugin::success();
   }
-  Error getDeviceHeapSize(uint64_t &Value) override {
-    Value = 0;
-    return Plugin::success();
-  }
-  Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }
 
 private:
   /// Grid values for Generic ELF64 plugins.
diff --git a/offload/test/offloading/malloc_parallel.c b/offload/test/libc/malloc_parallel.c
similarity index 100%
rename from offload/test/offloading/malloc_parallel.c
rename to offload/test/libc/malloc_parallel.c
diff --git a/offload/test/mapping/lambda_mapping.cpp b/offload/test/mapping/lambda_mapping.cpp
index 63b1719fbbc36..8e640b7fff3aa 100644
--- a/offload/test/mapping/lambda_mapping.cpp
+++ b/offload/test/mapping/lambda_mapping.cpp
@@ -4,6 +4,8 @@
 // RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
 // RUN: %libomptarget-compileoptxx-run-and-check-generic
 
+// REQUIRES: libc
+
 #include <iostream>
 
 template <typename LOOP_BODY>
diff --git a/offload/test/offloading/malloc.c b/offload/test/offloading/malloc.c
deleted file mode 100644
index 7b98e1f1110e5..0000000000000
--- a/offload/test/offloading/malloc.c
+++ /dev/null
@@ -1,37 +0,0 @@
-// RUN: %libomptarget-compile-generic && %libomptarget-run-generic
-// RUN: %libomptarget-compileopt-generic && %libomptarget-run-generic
-
-#include <stdio.h>
-#include <stdlib.h>
-
-int main() {
-  long unsigned *DP = 0;
-  int N = 32;
-  int Threads = 64;
-  int Teams = 10;
-
-  // Allocate ~55MB on the device.
-#pragma omp target map(from : DP)
-  DP = (long unsigned *)malloc(sizeof(long unsigned) * N * Threads * Teams);
-
-#pragma omp target teams distribute parallel for num_teams(Teams)              \
-    thread_limit(Threads) is_device_ptr(DP)
-  for (int i = 0; i < Threads * Teams; ++i) {
-    for (int j = 0; j < N; ++j) {
-      DP[i * N + j] = i + j;
-    }
-  }
-
-  long unsigned s = 0;
-#pragma omp target teams distribute parallel for num_teams(Teams)              \
-    thread_limit(Threads) reduction(+ : s)
-  for (int i = 0; i < Threads * Teams; ++i) {
-    for (int j = 0; j < N; ++j) {
-      s += DP[i * N + j];
-    }
-  }
-
-  // CHECK: Sum: 6860800
-  printf("Sum: %li\n", s);
-  return 0;
-}
diff --git a/openmp/device/include/Allocator.h b/openmp/device/include/Allocator.h
index dc4d029ed75f3..507ec6327126a 100644
--- a/openmp/device/include/Allocator.h
+++ b/openmp/device/include/Allocator.h
@@ -14,18 +14,12 @@
 
 #include "DeviceTypes.h"
 
-// Forward declaration.
-struct KernelEnvironmentTy;
-
 namespace ompx {
 
 namespace allocator {
 
 static uint64_t constexpr ALIGNMENT = 16;
 
-/// Initialize the allocator according to \p KernelEnvironment
-void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);
-
 /// Allocate \p Size bytes.
 [[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void *
 alloc(uint64_t Size);
diff --git a/openmp/device/src/Allocator.cpp b/openmp/device/src/Allocator.cpp
index aac2a6005158e..34c945c979ffb 100644
--- a/openmp/device/src/Allocator.cpp
+++ b/openmp/device/src/Allocator.cpp
@@ -18,42 +18,36 @@
 #include "Synchronization.h"
 
 using namespace ompx;
+using namespace allocator;
+
+// Provide a default implementation of malloc / free for AMDGPU platforms built
+// without 'libc' support.
+extern "C" {
+#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
+[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
+#else
+[[gnu::leaf]] void *malloc(size_t Size);
+[[gnu::leaf]] void free(void *Ptr);
+#endif
+}
 
-[[gnu::used, gnu::retain, gnu::weak,
-  gnu::visibility(
-      "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
-[[gnu::used, gnu::retain, gnu::weak,
-  gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
-    __omp_rtl_device_memory_pool_tracker;
+static constexpr uint64_t MEMORY_SIZE = /* 1 MiB */ 1024 * 1024;
+alignas(ALIGNMENT) static uint8_t Memory[MEMORY_SIZE] = {0};
 
-/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool
-/// directly.
+// Fallback bump pointer interface for platforms without a functioning
+// allocator.
 struct BumpAllocatorTy final {
+  uint64_t Offset = 0;
 
   void *alloc(uint64_t Size) {
     Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));
 
-    if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) {
-      atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1,
-                  atomic::seq_cst);
-      atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size,
-                  atomic::seq_cst);
-      atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size,
-                  atomic::seq_cst);
-      atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size,
-                  atomic::seq_cst);
-    }
-
-    uint64_t *Data =
-        reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr);
-    uint64_t End =
-        reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size;
-
-    uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst);
-    if (OldData + Size > End)
+    uint64_t OldData = atomic::add(&Offset, Size, atomic::seq_cst);
+    if (OldData + Size >= MEMORY_SIZE)
       __builtin_trap();
 
-    return reinterpret_cast<void *>(OldData);
+    return &Memory[OldData];
   }
 
   void free(void *) {}
@@ -65,13 +59,20 @@ BumpAllocatorTy BumpAllocator;
 ///
 ///{
 
-void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
-  // TODO: Check KernelEnvironment for an allocator choice as soon as we have
-  // more than one.
+void *allocator::alloc(uint64_t Size) {
+#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+  return BumpAllocator.alloc(Size);
+#else
+  return ::malloc(Size);
+#endif
 }
 
-void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
-
-void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
+void allocator::free(void *Ptr) {
+#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+  BumpAllocator.free(Ptr);
+#else
+  ::free(Ptr);
+#endif
+}
 
 ///}
diff --git a/openmp/device/src/Kernel.cpp b/openmp/device/src/Kernel.cpp
index 8c2828b270419..05af35d242ac5 100644
--- a/openmp/device/src/Kernel.cpp
+++ b/openmp/device/src/Kernel.cpp
@@ -41,7 +41,6 @@ inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
   synchronize::init(IsSPMD);
   mapping::init(IsSPMD);
   state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment);
-  allocator::init(IsSPMD, KernelEnvironment);
   workshare::init(IsSPMD);
 }
 
diff --git a/openmp/device/src/Misc.cpp b/openmp/device/src/Misc.cpp
index 563f674d166e5..a53fb4302fdb5 100644
--- a/openmp/device/src/Misc.cpp
+++ b/openmp/device/src/Misc.cpp
@@ -100,7 +100,7 @@ void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
   cas...
[truncated]

llvmbot · 2025-10-15T21:12:13Z

@llvm/pr-subscribers-offload

Author: Joseph Huber (jhuber6)

Changes

Summary:
This was a lot of code that was only used for upstream LLVM builds of
AMDGPU offloading. We have a generic and fast malloc in libc now so
just use that. Simplifies code, can be added back if we start providing
alternate forms but I don't think there's a single use-case that would
justify it yet.

Patch is 22.38 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/163629.diff

15 Files Affected:

(modified) offload/include/Shared/Environment.h (-22)
(modified) offload/plugins-nextgen/amdgpu/src/rtl.cpp (-14)
(modified) offload/plugins-nextgen/common/include/PluginInterface.h (+9-15)
(modified) offload/plugins-nextgen/common/src/PluginInterface.cpp (+1-95)
(modified) offload/plugins-nextgen/cuda/src/rtl.cpp (-5)
(modified) offload/plugins-nextgen/host/src/rtl.cpp (-8)
(renamed) offload/test/libc/malloc_parallel.c ()
(modified) offload/test/mapping/lambda_mapping.cpp (+2)
(removed) offload/test/offloading/malloc.c (-37)
(modified) openmp/device/include/Allocator.h (-6)
(modified) openmp/device/src/Allocator.cpp (+34-33)
(modified) openmp/device/src/Kernel.cpp (-1)
(modified) openmp/device/src/Misc.cpp (+2-2)
(modified) openmp/device/src/State.cpp (+2-22)
(modified) openmp/docs/design/Runtimes.rst (-1)

diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h
index 2a283bd6fa4ed..79e45fd8e082d 100644
--- a/offload/include/Shared/Environment.h
+++ b/offload/include/Shared/Environment.h
@@ -21,7 +21,6 @@ enum class DeviceDebugKind : uint32_t {
   Assertion = 1U << 0,
   FunctionTracing = 1U << 1,
   CommonIssues = 1U << 2,
-  AllocationTracker = 1U << 3,
   PGODump = 1U << 4,
 };
 
@@ -36,27 +35,6 @@ struct DeviceEnvironmentTy {
   uint64_t HardwareParallelism;
 };
 
-struct DeviceMemoryPoolTy {
-  void *Ptr;
-  uint64_t Size;
-};
-
-struct DeviceMemoryPoolTrackingTy {
-  uint64_t NumAllocations;
-  uint64_t AllocationTotal;
-  uint64_t AllocationMin;
-  uint64_t AllocationMax;
-
-  void combine(DeviceMemoryPoolTrackingTy &Other) {
-    NumAllocations += Other.NumAllocations;
-    AllocationTotal += Other.AllocationTotal;
-    AllocationMin = AllocationMin > Other.AllocationMin ? Other.AllocationMin
-                                                        : AllocationMin;
-    AllocationMax = AllocationMax < Other.AllocationMax ? Other.AllocationMax
-                                                        : AllocationMax;
-  }
-};
-
 // NOTE: Please don't change the order of those members as their indices are
 // used in the middle end. Always add the new data member at the end.
 // Different from KernelEnvironmentTy below, this structure contains members
diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
index a7723b8598815..4d827b3ad31e2 100644
--- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -3057,17 +3057,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     StackSize = Value;
     return Plugin::success();
   }
-  Error getDeviceHeapSize(uint64_t &Value) override {
-    Value = DeviceMemoryPoolSize;
-    return Plugin::success();
-  }
-  Error setDeviceHeapSize(uint64_t Value) override {
-    for (DeviceImageTy *Image : LoadedImages)
-      if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value))
-        return Err;
-    DeviceMemoryPoolSize = Value;
-    return Plugin::success();
-  }
   Error getDeviceMemorySize(uint64_t &Value) override {
     for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
       if (Pool->isGlobal()) {
@@ -3269,9 +3258,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// Reference to the host device.
   AMDHostDeviceTy &HostDevice;
 
-  /// The current size of the global device memory pool (managed by us).
-  uint64_t DeviceMemoryPoolSize = 1L << 29L /*512MB=*/;
-
   /// The current size of the stack that will be used in cases where it could
   /// not be statically determined.
   uint64_t StackSize = 16 * 1024 /* 16 KB */;
diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h
index 8c530bba3882c..26723d94199e7 100644
--- a/offload/plugins-nextgen/common/include/PluginInterface.h
+++ b/offload/plugins-nextgen/common/include/PluginInterface.h
@@ -816,10 +816,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   Error unloadBinary(DeviceImageTy *Image);
   virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0;
 
-  /// Setup the global device memory pool, if the plugin requires one.
-  Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
-                              uint64_t PoolSize);
-
   // Setup the RPC server for this device if needed. This may not run on some
   // plugins like the CPU targets. By default, it will not be executed so it is
   // up to the target to override this using the shouldSetupRPCServer function.
@@ -1061,6 +1057,15 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
 
   virtual Error getDeviceStackSize(uint64_t &V) = 0;
 
+  virtual Error getDeviceHeapSize(uint64_t &V) {
+    return Plugin::error(error::ErrorCode::UNSUPPORTED,
+                         "%s not supported by platform", __func__);
+  }
+  virtual Error setDeviceHeapSize(uint64_t V) {
+    return Plugin::error(error::ErrorCode::UNSUPPORTED,
+                         "%s not supported by platform", __func__);
+  }
+
   /// Returns true if current plugin architecture is an APU
   /// and unified_shared_memory was not requested by the program.
   bool useAutoZeroCopy();
@@ -1149,12 +1154,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// plugin can implement the setters as no-op and setting the output
   /// value to zero for the getters.
   virtual Error setDeviceStackSize(uint64_t V) = 0;
-  virtual Error getDeviceHeapSize(uint64_t &V) = 0;
-  virtual Error setDeviceHeapSize(uint64_t V) = 0;
-
-  /// Indicate whether the device should setup the global device memory pool. If
-  /// false is return the value on the device will be uninitialized.
-  virtual bool shouldSetupDeviceMemoryPool() const { return true; }
 
   /// Indicate whether or not the device should setup the RPC server. This is
   /// only necessary for unhosted targets like the GPU.
@@ -1175,7 +1174,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   Int32Envar OMPX_DebugKind;
   UInt32Envar OMPX_SharedMemorySize;
   UInt64Envar OMPX_TargetStackSize;
-  UInt64Envar OMPX_TargetHeapSize;
 
   /// Environment flag to set the minimum number of threads we use for a
   /// low-trip count combined loop. Instead of using more threads we increase
@@ -1229,10 +1227,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// Internal representation for OMPT device (initialize & finalize)
   std::atomic<bool> OmptInitialized;
 #endif
-
-private:
-  DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
-  DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
 };
 
 /// Class implementing common functionalities of offload plugins. Each plugin
diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp
index db43cbe49cc2b..b0a4113c2c3ec 100644
--- a/offload/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp
@@ -708,7 +708,7 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
       // Do not initialize the following two envars since they depend on the
       // device initialization. These cannot be consulted until the device is
       // initialized correctly. We initialize them in GenericDeviceTy::init().
-      OMPX_TargetStackSize(), OMPX_TargetHeapSize(),
+      OMPX_TargetStackSize(),
       // By default, the initial number of streams and events is 1.
       OMPX_InitialNumStreams("LIBOMPTARGET_NUM_INITIAL_STREAMS", 1),
       OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1),
@@ -758,14 +758,6 @@ Error GenericDeviceTy::init(GenericPluginTy &Plugin) {
     return StackSizeEnvarOrErr.takeError();
   OMPX_TargetStackSize = std::move(*StackSizeEnvarOrErr);
 
-  auto HeapSizeEnvarOrErr = UInt64Envar::create(
-      "LIBOMPTARGET_HEAP_SIZE",
-      [this](uint64_t &V) -> Error { return getDeviceHeapSize(V); },
-      [this](uint64_t V) -> Error { return setDeviceHeapSize(V); });
-  if (!HeapSizeEnvarOrErr)
-    return HeapSizeEnvarOrErr.takeError();
-  OMPX_TargetHeapSize = std::move(*HeapSizeEnvarOrErr);
-
   // Update the maximum number of teams and threads after the device
   // initialization sets the corresponding hardware limit.
   if (OMP_NumTeams > 0)
@@ -791,19 +783,6 @@ Error GenericDeviceTy::unloadBinary(DeviceImageTy *Image) {
   if (auto Err = callGlobalDestructors(Plugin, *Image))
     return Err;
 
-  if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
-    GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
-    DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
-    GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
-                           sizeof(DeviceMemoryPoolTrackingTy),
-                           &ImageDeviceMemoryPoolTracking);
-    if (auto Err =
-            GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
-      consumeError(std::move(Err));
-    }
-    DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
-  }
-
   GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
   auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
   if (!ProfOrErr)
@@ -829,22 +808,6 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
       return Err;
   LoadedImages.clear();
 
-  if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
-    // TODO: Write this by default into a file.
-    printf("\n\n|-----------------------\n"
-           "| Device memory tracker:\n"
-           "|-----------------------\n"
-           "| #Allocations: %lu\n"
-           "| Byes allocated: %lu\n"
-           "| Minimal allocation: %lu\n"
-           "| Maximal allocation: %lu\n"
-           "|-----------------------\n\n\n",
-           DeviceMemoryPoolTracking.NumAllocations,
-           DeviceMemoryPoolTracking.AllocationTotal,
-           DeviceMemoryPoolTracking.AllocationMin,
-           DeviceMemoryPoolTracking.AllocationMax);
-  }
-
   // Delete the memory manager before deinitializing the device. Otherwise,
   // we may delete device allocations after the device is deinitialized.
   if (MemoryManager)
@@ -897,18 +860,6 @@ Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
   // Add the image to list.
   LoadedImages.push_back(Image);
 
-  // Setup the global device memory pool if needed.
-  if (!Plugin.getRecordReplay().isReplaying() &&
-      shouldSetupDeviceMemoryPool()) {
-    uint64_t HeapSize;
-    auto SizeOrErr = getDeviceHeapSize(HeapSize);
-    if (SizeOrErr) {
-      REPORT("No global device memory pool due to error: %s\n",
-             toString(std::move(SizeOrErr)).data());
-    } else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize))
-      return std::move(Err);
-  }
-
   if (auto Err = setupRPCServer(Plugin, *Image))
     return std::move(Err);
 
@@ -932,51 +883,6 @@ Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
   return Image;
 }
 
-Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
-                                             DeviceImageTy &Image,
-                                             uint64_t PoolSize) {
-  // Free the old pool, if any.
-  if (DeviceMemoryPool.Ptr) {
-    if (auto Err = dataDelete(DeviceMemoryPool.Ptr,
-                              TargetAllocTy::TARGET_ALLOC_DEVICE))
-      return Err;
-  }
-
-  DeviceMemoryPool.Size = PoolSize;
-  auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr,
-                              TargetAllocTy::TARGET_ALLOC_DEVICE);
-  if (AllocOrErr) {
-    DeviceMemoryPool.Ptr = *AllocOrErr;
-  } else {
-    auto Err = AllocOrErr.takeError();
-    REPORT("Failure to allocate device memory for global memory pool: %s\n",
-           toString(std::move(Err)).data());
-    DeviceMemoryPool.Ptr = nullptr;
-    DeviceMemoryPool.Size = 0;
-  }
-
-  // Create the metainfo of the device environment global.
-  GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
-  if (!GHandler.isSymbolInImage(*this, Image,
-                                "__omp_rtl_device_memory_pool_tracker")) {
-    DP("Skip the memory pool as there is no tracker symbol in the image.");
-    return Error::success();
-  }
-
-  GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
-                         sizeof(DeviceMemoryPoolTrackingTy),
-                         &DeviceMemoryPoolTracking);
-  if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
-    return Err;
-
-  // Create the metainfo of the device environment global.
-  GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool",
-                        sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool);
-
-  // Write device environment values to the device.
-  return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal);
-}
-
 Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
                                       DeviceImageTy &Image) {
   // The plugin either does not need an RPC server or it is unavailable.
diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp
index db94f7f2dd995..3ef724328916a 100644
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -1228,11 +1228,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
     return Info;
   }
 
-  virtual bool shouldSetupDeviceMemoryPool() const override {
-    /// We use the CUDA malloc for now.
-    return false;
-  }
-
   /// Getters and setters for stack and heap sizes.
   Error getDeviceStackSize(uint64_t &Value) override {
     return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp
index eb4ecac9907a1..48de1fefa29d6 100644
--- a/offload/plugins-nextgen/host/src/rtl.cpp
+++ b/offload/plugins-nextgen/host/src/rtl.cpp
@@ -380,9 +380,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
     return Info;
   }
 
-  /// This plugin should not setup the device environment or memory pool.
-  virtual bool shouldSetupDeviceMemoryPool() const override { return false; };
-
   /// Getters and setters for stack size and heap size not relevant.
   Error getDeviceStackSize(uint64_t &Value) override {
     Value = 0;
@@ -391,11 +388,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
   Error setDeviceStackSize(uint64_t Value) override {
     return Plugin::success();
   }
-  Error getDeviceHeapSize(uint64_t &Value) override {
-    Value = 0;
-    return Plugin::success();
-  }
-  Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }
 
 private:
   /// Grid values for Generic ELF64 plugins.
diff --git a/offload/test/offloading/malloc_parallel.c b/offload/test/libc/malloc_parallel.c
similarity index 100%
rename from offload/test/offloading/malloc_parallel.c
rename to offload/test/libc/malloc_parallel.c
diff --git a/offload/test/mapping/lambda_mapping.cpp b/offload/test/mapping/lambda_mapping.cpp
index 63b1719fbbc36..8e640b7fff3aa 100644
--- a/offload/test/mapping/lambda_mapping.cpp
+++ b/offload/test/mapping/lambda_mapping.cpp
@@ -4,6 +4,8 @@
 // RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
 // RUN: %libomptarget-compileoptxx-run-and-check-generic
 
+// REQUIRES: libc
+
 #include <iostream>
 
 template <typename LOOP_BODY>
diff --git a/offload/test/offloading/malloc.c b/offload/test/offloading/malloc.c
deleted file mode 100644
index 7b98e1f1110e5..0000000000000
--- a/offload/test/offloading/malloc.c
+++ /dev/null
@@ -1,37 +0,0 @@
-// RUN: %libomptarget-compile-generic && %libomptarget-run-generic
-// RUN: %libomptarget-compileopt-generic && %libomptarget-run-generic
-
-#include <stdio.h>
-#include <stdlib.h>
-
-int main() {
-  long unsigned *DP = 0;
-  int N = 32;
-  int Threads = 64;
-  int Teams = 10;
-
-  // Allocate ~55MB on the device.
-#pragma omp target map(from : DP)
-  DP = (long unsigned *)malloc(sizeof(long unsigned) * N * Threads * Teams);
-
-#pragma omp target teams distribute parallel for num_teams(Teams)              \
-    thread_limit(Threads) is_device_ptr(DP)
-  for (int i = 0; i < Threads * Teams; ++i) {
-    for (int j = 0; j < N; ++j) {
-      DP[i * N + j] = i + j;
-    }
-  }
-
-  long unsigned s = 0;
-#pragma omp target teams distribute parallel for num_teams(Teams)              \
-    thread_limit(Threads) reduction(+ : s)
-  for (int i = 0; i < Threads * Teams; ++i) {
-    for (int j = 0; j < N; ++j) {
-      s += DP[i * N + j];
-    }
-  }
-
-  // CHECK: Sum: 6860800
-  printf("Sum: %li\n", s);
-  return 0;
-}
diff --git a/openmp/device/include/Allocator.h b/openmp/device/include/Allocator.h
index dc4d029ed75f3..507ec6327126a 100644
--- a/openmp/device/include/Allocator.h
+++ b/openmp/device/include/Allocator.h
@@ -14,18 +14,12 @@
 
 #include "DeviceTypes.h"
 
-// Forward declaration.
-struct KernelEnvironmentTy;
-
 namespace ompx {
 
 namespace allocator {
 
 static uint64_t constexpr ALIGNMENT = 16;
 
-/// Initialize the allocator according to \p KernelEnvironment
-void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);
-
 /// Allocate \p Size bytes.
 [[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void *
 alloc(uint64_t Size);
diff --git a/openmp/device/src/Allocator.cpp b/openmp/device/src/Allocator.cpp
index aac2a6005158e..34c945c979ffb 100644
--- a/openmp/device/src/Allocator.cpp
+++ b/openmp/device/src/Allocator.cpp
@@ -18,42 +18,36 @@
 #include "Synchronization.h"
 
 using namespace ompx;
+using namespace allocator;
+
+// Provide a default implementation of malloc / free for AMDGPU platforms built
+// without 'libc' support.
+extern "C" {
+#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
+[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
+#else
+[[gnu::leaf]] void *malloc(size_t Size);
+[[gnu::leaf]] void free(void *Ptr);
+#endif
+}
 
-[[gnu::used, gnu::retain, gnu::weak,
-  gnu::visibility(
-      "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
-[[gnu::used, gnu::retain, gnu::weak,
-  gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
-    __omp_rtl_device_memory_pool_tracker;
+static constexpr uint64_t MEMORY_SIZE = /* 1 MiB */ 1024 * 1024;
+alignas(ALIGNMENT) static uint8_t Memory[MEMORY_SIZE] = {0};
 
-/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool
-/// directly.
+// Fallback bump pointer interface for platforms without a functioning
+// allocator.
 struct BumpAllocatorTy final {
+  uint64_t Offset = 0;
 
   void *alloc(uint64_t Size) {
     Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));
 
-    if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) {
-      atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1,
-                  atomic::seq_cst);
-      atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size,
-                  atomic::seq_cst);
-      atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size,
-                  atomic::seq_cst);
-      atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size,
-                  atomic::seq_cst);
-    }
-
-    uint64_t *Data =
-        reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr);
-    uint64_t End =
-        reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size;
-
-    uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst);
-    if (OldData + Size > End)
+    uint64_t OldData = atomic::add(&Offset, Size, atomic::seq_cst);
+    if (OldData + Size >= MEMORY_SIZE)
       __builtin_trap();
 
-    return reinterpret_cast<void *>(OldData);
+    return &Memory[OldData];
   }
 
   void free(void *) {}
@@ -65,13 +59,20 @@ BumpAllocatorTy BumpAllocator;
 ///
 ///{
 
-void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
-  // TODO: Check KernelEnvironment for an allocator choice as soon as we have
-  // more than one.
+void *allocator::alloc(uint64_t Size) {
+#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+  return BumpAllocator.alloc(Size);
+#else
+  return ::malloc(Size);
+#endif
 }
 
-void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
-
-void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
+void allocator::free(void *Ptr) {
+#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
+  BumpAllocator.free(Ptr);
+#else
+  ::free(Ptr);
+#endif
+}
 
 ///}
diff --git a/openmp/device/src/Kernel.cpp b/openmp/device/src/Kernel.cpp
index 8c2828b270419..05af35d242ac5 100644
--- a/openmp/device/src/Kernel.cpp
+++ b/openmp/device/src/Kernel.cpp
@@ -41,7 +41,6 @@ inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
   synchronize::init(IsSPMD);
   mapping::init(IsSPMD);
   state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment);
-  allocator::init(IsSPMD, KernelEnvironment);
   workshare::init(IsSPMD);
 }
 
diff --git a/openmp/device/src/Misc.cpp b/openmp/device/src/Misc.cpp
index 563f674d166e5..a53fb4302fdb5 100644
--- a/openmp/device/src/Misc.cpp
+++ b/openmp/device/src/Misc.cpp
@@ -100,7 +100,7 @@ void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
   cas...
[truncated]

shiltian

Well, I'm not sure what our policy regarding using offload with libc. Is that required now? If not, it is okay for me to have our own implementation. Alternatively, even if we want to remove it, we want to document it somewhere that, if anyone wants to use device side dynamic allocation, they need to enable libc project; otherwise this will not work.

On the other hand, I think even our device runtime requires the support for dynamic allocation for the smart stack when static shared memory allocation runs out. This PR will make it completely not work.

jhuber6 · 2025-10-17T21:08:24Z

Well, I'm not sure what our policy regarding using offload with libc. Is that required now? If not, it is okay for me to have our own implementation. Alternatively, even if we want to remove it, we want to document it somewhere that, if anyone wants to use device side dynamic allocation, they need to enable libc project; otherwise this will not work.

On the other hand, I think even our device runtime requires the support for dynamic allocation for the smart stack when static shared memory allocation runs out. This PR will make it completely not work.

AOMP shipped their own malloc, CUDA uses their own malloc, upstream libc has real malloc, this was never a functional implementation and just made a gigantic bump pointer to cover up the fact that we didn't have it. libc isn't "required", but if you have an application that needs more than a trivial bump pointer for malloc then you should use libc.

shiltian · 2025-10-17T21:09:58Z

How about upstream offload w/o libc?

jhuber6 · 2025-10-17T21:10:51Z

How about upstream offload w/o libc?

"this was never a functional implementation and just made a gigantic bump pointer to cover up the fact that we didn't have it." This patch still provides a bump pointer, it just doesn't support this dynamic resizing stuff. There was a lot of infra here that's just unnecessary since it wraps around a dummy implementation. I think this was intended to be expanded but I don't think that's likely. Using malloc is slow on GPUs and we have a correct one if people need it. I think anything else is unnecessary. If people need a custom malloc they will write one themselves.

shiltian · 2025-10-17T22:03:52Z

offload/test/offloading/malloc.c

-  }
-
-  long unsigned s = 0;
-#pragma omp target teams distribute parallel for num_teams(Teams)              \


I'd tweak this test case such that it can still work with the 1MB memory.

I checked it, and unless I'm mistaken this only uses 160 KiB. Maybe it was reduced because CUDA only supported like one MiB max.

Summary: This was a lot of code that was only used for upstream LLVM builds of AMDGPU offloading. We have a generic and fast `malloc` in `libc` now so just use that. Simplifies code, can be added back if we start providing alternate forms but I don't think there's a single use-case that would justify it yet.

jhuber6 requested review from carlobertolli, jplehr, ronlieb, saiislam, sarnex and shiltian October 15, 2025 21:11

llvmbot added backend:AMDGPU openmp:libomp OpenMP host runtime openmp:libomptarget OpenMP offload runtime offload labels Oct 15, 2025

jhuber6 force-pushed the RemoveAllocator branch from 665bd6c to 676205c Compare October 15, 2025 21:16

shiltian reviewed Oct 17, 2025

View reviewed changes

jhuber6 added 2 commits October 17, 2025 17:05

re-add test

52f33db

jhuber6 force-pushed the RemoveAllocator branch from 676205c to 52f33db Compare October 17, 2025 22:12

shiltian approved these changes Oct 17, 2025

View reviewed changes

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

[Offload] Remove handling for device memory pool #163629

[Offload] Remove handling for device memory pool #163629

jhuber6 commented Oct 15, 2025

Uh oh!

llvmbot commented Oct 15, 2025

Uh oh!

llvmbot commented Oct 15, 2025

Uh oh!

shiltian left a comment

Uh oh!

jhuber6 commented Oct 17, 2025

Uh oh!

shiltian commented Oct 17, 2025

Uh oh!

jhuber6 commented Oct 17, 2025 •

edited

Loading

Uh oh!

shiltian Oct 17, 2025

Uh oh!

jhuber6 Oct 17, 2025

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

[Offload] Remove handling for device memory pool #163629

Are you sure you want to change the base?

[Offload] Remove handling for device memory pool #163629

Conversation

jhuber6 commented Oct 15, 2025

Uh oh!

llvmbot commented Oct 15, 2025

Uh oh!

llvmbot commented Oct 15, 2025

Uh oh!

shiltian left a comment

Choose a reason for hiding this comment

Uh oh!

jhuber6 commented Oct 17, 2025

Uh oh!

shiltian commented Oct 17, 2025

Uh oh!

jhuber6 commented Oct 17, 2025 • edited Loading Uh oh! There was an error while loading. Please reload this page.

Uh oh!

Uh oh!

shiltian Oct 17, 2025

Choose a reason for hiding this comment

Uh oh!

jhuber6 Oct 17, 2025

Choose a reason for hiding this comment

Uh oh!

Reviewers

Assignees

Labels

Projects

Milestone

Development

Uh oh!

3 participants

jhuber6 commented Oct 17, 2025 •

edited

Loading