diff --git a/offload/include/Shared/Environment.h b/offload/include/Shared/Environment.h index 2a283bd6fa4ed..79e45fd8e082d 100644 --- a/offload/include/Shared/Environment.h +++ b/offload/include/Shared/Environment.h @@ -21,7 +21,6 @@ enum class DeviceDebugKind : uint32_t { Assertion = 1U << 0, FunctionTracing = 1U << 1, CommonIssues = 1U << 2, - AllocationTracker = 1U << 3, PGODump = 1U << 4, }; @@ -36,27 +35,6 @@ struct DeviceEnvironmentTy { uint64_t HardwareParallelism; }; -struct DeviceMemoryPoolTy { - void *Ptr; - uint64_t Size; -}; - -struct DeviceMemoryPoolTrackingTy { - uint64_t NumAllocations; - uint64_t AllocationTotal; - uint64_t AllocationMin; - uint64_t AllocationMax; - - void combine(DeviceMemoryPoolTrackingTy &Other) { - NumAllocations += Other.NumAllocations; - AllocationTotal += Other.AllocationTotal; - AllocationMin = AllocationMin > Other.AllocationMin ? Other.AllocationMin - : AllocationMin; - AllocationMax = AllocationMax < Other.AllocationMax ? Other.AllocationMax - : AllocationMax; - } -}; - // NOTE: Please don't change the order of those members as their indices are // used in the middle end. Always add the new data member at the end. // Different from KernelEnvironmentTy below, this structure contains members diff --git a/offload/plugins-nextgen/amdgpu/src/rtl.cpp b/offload/plugins-nextgen/amdgpu/src/rtl.cpp index a7723b8598815..4d827b3ad31e2 100644 --- a/offload/plugins-nextgen/amdgpu/src/rtl.cpp +++ b/offload/plugins-nextgen/amdgpu/src/rtl.cpp @@ -3057,17 +3057,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { StackSize = Value; return Plugin::success(); } - Error getDeviceHeapSize(uint64_t &Value) override { - Value = DeviceMemoryPoolSize; - return Plugin::success(); - } - Error setDeviceHeapSize(uint64_t Value) override { - for (DeviceImageTy *Image : LoadedImages) - if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value)) - return Err; - DeviceMemoryPoolSize = Value; - return Plugin::success(); - } Error getDeviceMemorySize(uint64_t &Value) override { for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) { if (Pool->isGlobal()) { @@ -3269,9 +3258,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy { /// Reference to the host device. AMDHostDeviceTy &HostDevice; - /// The current size of the global device memory pool (managed by us). - uint64_t DeviceMemoryPoolSize = 1L << 29L /*512MB=*/; - /// The current size of the stack that will be used in cases where it could /// not be statically determined. uint64_t StackSize = 16 * 1024 /* 16 KB */; diff --git a/offload/plugins-nextgen/common/include/PluginInterface.h b/offload/plugins-nextgen/common/include/PluginInterface.h index 8c530bba3882c..3a108a9859d72 100644 --- a/offload/plugins-nextgen/common/include/PluginInterface.h +++ b/offload/plugins-nextgen/common/include/PluginInterface.h @@ -816,10 +816,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy { Error unloadBinary(DeviceImageTy *Image); virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0; - /// Setup the global device memory pool, if the plugin requires one. - Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image, - uint64_t PoolSize); - // Setup the RPC server for this device if needed. This may not run on some // plugins like the CPU targets. By default, it will not be executed so it is // up to the target to override this using the shouldSetupRPCServer function. @@ -1061,6 +1057,15 @@ struct GenericDeviceTy : public DeviceAllocatorTy { virtual Error getDeviceStackSize(uint64_t &V) = 0; + virtual Error getDeviceHeapSize(uint64_t &V) { + return Plugin::error(error::ErrorCode::UNSUPPORTED, + "%s not supported by platform", __func__); + } + virtual Error setDeviceHeapSize(uint64_t V) { + return Plugin::error(error::ErrorCode::UNSUPPORTED, + "%s not supported by platform", __func__); + } + /// Returns true if current plugin architecture is an APU /// and unified_shared_memory was not requested by the program. bool useAutoZeroCopy(); @@ -1149,12 +1154,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// plugin can implement the setters as no-op and setting the output /// value to zero for the getters. virtual Error setDeviceStackSize(uint64_t V) = 0; - virtual Error getDeviceHeapSize(uint64_t &V) = 0; - virtual Error setDeviceHeapSize(uint64_t V) = 0; - - /// Indicate whether the device should setup the global device memory pool. If - /// false is return the value on the device will be uninitialized. - virtual bool shouldSetupDeviceMemoryPool() const { return true; } /// Indicate whether or not the device should setup the RPC server. This is /// only necessary for unhosted targets like the GPU. @@ -1229,10 +1228,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy { /// Internal representation for OMPT device (initialize & finalize) std::atomic OmptInitialized; #endif - -private: - DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0}; - DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0}; }; /// Class implementing common functionalities of offload plugins. Each plugin diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index db43cbe49cc2b..220a9570864ba 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -791,19 +791,6 @@ Error GenericDeviceTy::unloadBinary(DeviceImageTy *Image) { if (auto Err = callGlobalDestructors(Plugin, *Image)) return Err; - if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) { - GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); - DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0}; - GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker", - sizeof(DeviceMemoryPoolTrackingTy), - &ImageDeviceMemoryPoolTracking); - if (auto Err = - GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) { - consumeError(std::move(Err)); - } - DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking); - } - GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image); if (!ProfOrErr) @@ -829,22 +816,6 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { return Err; LoadedImages.clear(); - if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) { - // TODO: Write this by default into a file. - printf("\n\n|-----------------------\n" - "| Device memory tracker:\n" - "|-----------------------\n" - "| #Allocations: %lu\n" - "| Byes allocated: %lu\n" - "| Minimal allocation: %lu\n" - "| Maximal allocation: %lu\n" - "|-----------------------\n\n\n", - DeviceMemoryPoolTracking.NumAllocations, - DeviceMemoryPoolTracking.AllocationTotal, - DeviceMemoryPoolTracking.AllocationMin, - DeviceMemoryPoolTracking.AllocationMax); - } - // Delete the memory manager before deinitializing the device. Otherwise, // we may delete device allocations after the device is deinitialized. if (MemoryManager) @@ -897,18 +868,6 @@ Expected GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, // Add the image to list. LoadedImages.push_back(Image); - // Setup the global device memory pool if needed. - if (!Plugin.getRecordReplay().isReplaying() && - shouldSetupDeviceMemoryPool()) { - uint64_t HeapSize; - auto SizeOrErr = getDeviceHeapSize(HeapSize); - if (SizeOrErr) { - REPORT("No global device memory pool due to error: %s\n", - toString(std::move(SizeOrErr)).data()); - } else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize)) - return std::move(Err); - } - if (auto Err = setupRPCServer(Plugin, *Image)) return std::move(Err); @@ -932,51 +891,6 @@ Expected GenericDeviceTy::loadBinary(GenericPluginTy &Plugin, return Image; } -Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin, - DeviceImageTy &Image, - uint64_t PoolSize) { - // Free the old pool, if any. - if (DeviceMemoryPool.Ptr) { - if (auto Err = dataDelete(DeviceMemoryPool.Ptr, - TargetAllocTy::TARGET_ALLOC_DEVICE)) - return Err; - } - - DeviceMemoryPool.Size = PoolSize; - auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr, - TargetAllocTy::TARGET_ALLOC_DEVICE); - if (AllocOrErr) { - DeviceMemoryPool.Ptr = *AllocOrErr; - } else { - auto Err = AllocOrErr.takeError(); - REPORT("Failure to allocate device memory for global memory pool: %s\n", - toString(std::move(Err)).data()); - DeviceMemoryPool.Ptr = nullptr; - DeviceMemoryPool.Size = 0; - } - - // Create the metainfo of the device environment global. - GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler(); - if (!GHandler.isSymbolInImage(*this, Image, - "__omp_rtl_device_memory_pool_tracker")) { - DP("Skip the memory pool as there is no tracker symbol in the image."); - return Error::success(); - } - - GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker", - sizeof(DeviceMemoryPoolTrackingTy), - &DeviceMemoryPoolTracking); - if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal)) - return Err; - - // Create the metainfo of the device environment global. - GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool", - sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool); - - // Write device environment values to the device. - return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal); -} - Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image) { // The plugin either does not need an RPC server or it is unavailable. diff --git a/offload/plugins-nextgen/cuda/src/rtl.cpp b/offload/plugins-nextgen/cuda/src/rtl.cpp index db94f7f2dd995..3ef724328916a 100644 --- a/offload/plugins-nextgen/cuda/src/rtl.cpp +++ b/offload/plugins-nextgen/cuda/src/rtl.cpp @@ -1228,11 +1228,6 @@ struct CUDADeviceTy : public GenericDeviceTy { return Info; } - virtual bool shouldSetupDeviceMemoryPool() const override { - /// We use the CUDA malloc for now. - return false; - } - /// Getters and setters for stack and heap sizes. Error getDeviceStackSize(uint64_t &Value) override { return getCtxLimit(CU_LIMIT_STACK_SIZE, Value); diff --git a/offload/plugins-nextgen/host/src/rtl.cpp b/offload/plugins-nextgen/host/src/rtl.cpp index eb4ecac9907a1..48de1fefa29d6 100644 --- a/offload/plugins-nextgen/host/src/rtl.cpp +++ b/offload/plugins-nextgen/host/src/rtl.cpp @@ -380,9 +380,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy { return Info; } - /// This plugin should not setup the device environment or memory pool. - virtual bool shouldSetupDeviceMemoryPool() const override { return false; }; - /// Getters and setters for stack size and heap size not relevant. Error getDeviceStackSize(uint64_t &Value) override { Value = 0; @@ -391,11 +388,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy { Error setDeviceStackSize(uint64_t Value) override { return Plugin::success(); } - Error getDeviceHeapSize(uint64_t &Value) override { - Value = 0; - return Plugin::success(); - } - Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); } private: /// Grid values for Generic ELF64 plugins. diff --git a/offload/test/offloading/malloc_parallel.c b/offload/test/libc/malloc_parallel.c similarity index 100% rename from offload/test/offloading/malloc_parallel.c rename to offload/test/libc/malloc_parallel.c diff --git a/offload/test/mapping/lambda_mapping.cpp b/offload/test/mapping/lambda_mapping.cpp index 63b1719fbbc36..8e640b7fff3aa 100644 --- a/offload/test/mapping/lambda_mapping.cpp +++ b/offload/test/mapping/lambda_mapping.cpp @@ -4,6 +4,8 @@ // RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic // RUN: %libomptarget-compileoptxx-run-and-check-generic +// REQUIRES: libc + #include template diff --git a/offload/test/offloading/malloc.c b/offload/test/offloading/malloc.c index 7b98e1f1110e5..04e72561d3127 100644 --- a/offload/test/offloading/malloc.c +++ b/offload/test/offloading/malloc.c @@ -10,7 +10,7 @@ int main() { int Threads = 64; int Teams = 10; - // Allocate ~55MB on the device. + // Allocate ~160 KiB on the device. #pragma omp target map(from : DP) DP = (long unsigned *)malloc(sizeof(long unsigned) * N * Threads * Teams); diff --git a/openmp/device/include/Allocator.h b/openmp/device/include/Allocator.h index dc4d029ed75f3..507ec6327126a 100644 --- a/openmp/device/include/Allocator.h +++ b/openmp/device/include/Allocator.h @@ -14,18 +14,12 @@ #include "DeviceTypes.h" -// Forward declaration. -struct KernelEnvironmentTy; - namespace ompx { namespace allocator { static uint64_t constexpr ALIGNMENT = 16; -/// Initialize the allocator according to \p KernelEnvironment -void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment); - /// Allocate \p Size bytes. [[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void * alloc(uint64_t Size); diff --git a/openmp/device/src/Allocator.cpp b/openmp/device/src/Allocator.cpp index aac2a6005158e..34c945c979ffb 100644 --- a/openmp/device/src/Allocator.cpp +++ b/openmp/device/src/Allocator.cpp @@ -18,42 +18,36 @@ #include "Synchronization.h" using namespace ompx; +using namespace allocator; + +// Provide a default implementation of malloc / free for AMDGPU platforms built +// without 'libc' support. +extern "C" { +#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC) +[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); } +[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); } +#else +[[gnu::leaf]] void *malloc(size_t Size); +[[gnu::leaf]] void free(void *Ptr); +#endif +} -[[gnu::used, gnu::retain, gnu::weak, - gnu::visibility( - "protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool; -[[gnu::used, gnu::retain, gnu::weak, - gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy - __omp_rtl_device_memory_pool_tracker; +static constexpr uint64_t MEMORY_SIZE = /* 1 MiB */ 1024 * 1024; +alignas(ALIGNMENT) static uint8_t Memory[MEMORY_SIZE] = {0}; -/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool -/// directly. +// Fallback bump pointer interface for platforms without a functioning +// allocator. struct BumpAllocatorTy final { + uint64_t Offset = 0; void *alloc(uint64_t Size) { Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT)); - if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) { - atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1, - atomic::seq_cst); - atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size, - atomic::seq_cst); - atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size, - atomic::seq_cst); - atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size, - atomic::seq_cst); - } - - uint64_t *Data = - reinterpret_cast(&__omp_rtl_device_memory_pool.Ptr); - uint64_t End = - reinterpret_cast(Data) + __omp_rtl_device_memory_pool.Size; - - uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst); - if (OldData + Size > End) + uint64_t OldData = atomic::add(&Offset, Size, atomic::seq_cst); + if (OldData + Size >= MEMORY_SIZE) __builtin_trap(); - return reinterpret_cast(OldData); + return &Memory[OldData]; } void free(void *) {} @@ -65,13 +59,20 @@ BumpAllocatorTy BumpAllocator; /// ///{ -void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) { - // TODO: Check KernelEnvironment for an allocator choice as soon as we have - // more than one. +void *allocator::alloc(uint64_t Size) { +#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC) + return BumpAllocator.alloc(Size); +#else + return ::malloc(Size); +#endif } -void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); } - -void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); } +void allocator::free(void *Ptr) { +#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC) + BumpAllocator.free(Ptr); +#else + ::free(Ptr); +#endif +} ///} diff --git a/openmp/device/src/Kernel.cpp b/openmp/device/src/Kernel.cpp index 8c2828b270419..05af35d242ac5 100644 --- a/openmp/device/src/Kernel.cpp +++ b/openmp/device/src/Kernel.cpp @@ -41,7 +41,6 @@ inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment, synchronize::init(IsSPMD); mapping::init(IsSPMD); state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment); - allocator::init(IsSPMD, KernelEnvironment); workshare::init(IsSPMD); } diff --git a/openmp/device/src/Misc.cpp b/openmp/device/src/Misc.cpp index 563f674d166e5..a53fb4302fdb5 100644 --- a/openmp/device/src/Misc.cpp +++ b/openmp/device/src/Misc.cpp @@ -100,7 +100,7 @@ void *omp_alloc(size_t size, omp_allocator_handle_t allocator) { case omp_const_mem_alloc: case omp_high_bw_mem_alloc: case omp_low_lat_mem_alloc: - return malloc(size); + return ompx::allocator::alloc(size); default: return nullptr; } @@ -113,7 +113,7 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) { case omp_const_mem_alloc: case omp_high_bw_mem_alloc: case omp_low_lat_mem_alloc: - free(ptr); + ompx::allocator::free(ptr); return; case omp_null_allocator: default: diff --git a/openmp/device/src/State.cpp b/openmp/device/src/State.cpp index 475395102f47b..9f38cf26f8c6f 100644 --- a/openmp/device/src/State.cpp +++ b/openmp/device/src/State.cpp @@ -44,26 +44,6 @@ using namespace ompx; namespace { -/// Fallback implementations are missing to trigger a link time error. -/// Implementations for new devices, including the host, should go into a -/// dedicated begin/end declare variant. -/// -///{ -extern "C" { -#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC) - -[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); } -[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); } - -#else - -[[gnu::weak, gnu::leaf]] void *malloc(size_t Size); -[[gnu::weak, gnu::leaf]] void free(void *Ptr); - -#endif -} -///} - /// A "smart" stack in shared memory. /// /// The stack exposes a malloc/free interface but works like a stack internally. @@ -171,13 +151,13 @@ void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) { } void *memory::allocGlobal(uint64_t Bytes, const char *Reason) { - void *Ptr = malloc(Bytes); + void *Ptr = allocator::alloc(Bytes); if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr) printf("nullptr returned by malloc!\n"); return Ptr; } -void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); } +void memory::freeGlobal(void *Ptr, const char *Reason) { allocator::free(Ptr); } ///} diff --git a/openmp/docs/design/Runtimes.rst b/openmp/docs/design/Runtimes.rst index cd78a5ba88e2c..1b6f30ae73a33 100644 --- a/openmp/docs/design/Runtimes.rst +++ b/openmp/docs/design/Runtimes.rst @@ -1521,5 +1521,4 @@ debugging features are supported. * Enable debugging assertions in the device. ``0x01`` * Enable diagnosing common problems during offloading . ``0x4`` - * Enable device malloc statistics (amdgpu only). ``0x8`` * Dump device PGO counters (only if PGO on GPU is enabled). ``0x10``