Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 0 additions & 22 deletions offload/include/Shared/Environment.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ enum class DeviceDebugKind : uint32_t {
Assertion = 1U << 0,
FunctionTracing = 1U << 1,
CommonIssues = 1U << 2,
AllocationTracker = 1U << 3,
PGODump = 1U << 4,
};

Expand All @@ -36,27 +35,6 @@ struct DeviceEnvironmentTy {
uint64_t HardwareParallelism;
};

struct DeviceMemoryPoolTy {
void *Ptr;
uint64_t Size;
};

struct DeviceMemoryPoolTrackingTy {
uint64_t NumAllocations;
uint64_t AllocationTotal;
uint64_t AllocationMin;
uint64_t AllocationMax;

void combine(DeviceMemoryPoolTrackingTy &Other) {
NumAllocations += Other.NumAllocations;
AllocationTotal += Other.AllocationTotal;
AllocationMin = AllocationMin > Other.AllocationMin ? Other.AllocationMin
: AllocationMin;
AllocationMax = AllocationMax < Other.AllocationMax ? Other.AllocationMax
: AllocationMax;
}
};

// NOTE: Please don't change the order of those members as their indices are
// used in the middle end. Always add the new data member at the end.
// Different from KernelEnvironmentTy below, this structure contains members
Expand Down
14 changes: 0 additions & 14 deletions offload/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3057,17 +3057,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
StackSize = Value;
return Plugin::success();
}
Error getDeviceHeapSize(uint64_t &Value) override {
Value = DeviceMemoryPoolSize;
return Plugin::success();
}
Error setDeviceHeapSize(uint64_t Value) override {
for (DeviceImageTy *Image : LoadedImages)
if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value))
return Err;
DeviceMemoryPoolSize = Value;
return Plugin::success();
}
Error getDeviceMemorySize(uint64_t &Value) override {
for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
if (Pool->isGlobal()) {
Expand Down Expand Up @@ -3269,9 +3258,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
/// Reference to the host device.
AMDHostDeviceTy &HostDevice;

/// The current size of the global device memory pool (managed by us).
uint64_t DeviceMemoryPoolSize = 1L << 29L /*512MB=*/;

/// The current size of the stack that will be used in cases where it could
/// not be statically determined.
uint64_t StackSize = 16 * 1024 /* 16 KB */;
Expand Down
23 changes: 9 additions & 14 deletions offload/plugins-nextgen/common/include/PluginInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -816,10 +816,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
Error unloadBinary(DeviceImageTy *Image);
virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0;

/// Setup the global device memory pool, if the plugin requires one.
Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
uint64_t PoolSize);

// Setup the RPC server for this device if needed. This may not run on some
// plugins like the CPU targets. By default, it will not be executed so it is
// up to the target to override this using the shouldSetupRPCServer function.
Expand Down Expand Up @@ -1061,6 +1057,15 @@ struct GenericDeviceTy : public DeviceAllocatorTy {

virtual Error getDeviceStackSize(uint64_t &V) = 0;

virtual Error getDeviceHeapSize(uint64_t &V) {
return Plugin::error(error::ErrorCode::UNSUPPORTED,
"%s not supported by platform", __func__);
}
virtual Error setDeviceHeapSize(uint64_t V) {
return Plugin::error(error::ErrorCode::UNSUPPORTED,
"%s not supported by platform", __func__);
}

/// Returns true if current plugin architecture is an APU
/// and unified_shared_memory was not requested by the program.
bool useAutoZeroCopy();
Expand Down Expand Up @@ -1149,12 +1154,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// plugin can implement the setters as no-op and setting the output
/// value to zero for the getters.
virtual Error setDeviceStackSize(uint64_t V) = 0;
virtual Error getDeviceHeapSize(uint64_t &V) = 0;
virtual Error setDeviceHeapSize(uint64_t V) = 0;

/// Indicate whether the device should setup the global device memory pool. If
/// false is return the value on the device will be uninitialized.
virtual bool shouldSetupDeviceMemoryPool() const { return true; }

/// Indicate whether or not the device should setup the RPC server. This is
/// only necessary for unhosted targets like the GPU.
Expand Down Expand Up @@ -1229,10 +1228,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// Internal representation for OMPT device (initialize & finalize)
std::atomic<bool> OmptInitialized;
#endif

private:
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
};

/// Class implementing common functionalities of offload plugins. Each plugin
Expand Down
86 changes: 0 additions & 86 deletions offload/plugins-nextgen/common/src/PluginInterface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -791,19 +791,6 @@ Error GenericDeviceTy::unloadBinary(DeviceImageTy *Image) {
if (auto Err = callGlobalDestructors(Plugin, *Image))
return Err;

if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
sizeof(DeviceMemoryPoolTrackingTy),
&ImageDeviceMemoryPoolTracking);
if (auto Err =
GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
consumeError(std::move(Err));
}
DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
}

GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
if (!ProfOrErr)
Expand All @@ -829,22 +816,6 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
return Err;
LoadedImages.clear();

if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
// TODO: Write this by default into a file.
printf("\n\n|-----------------------\n"
"| Device memory tracker:\n"
"|-----------------------\n"
"| #Allocations: %lu\n"
"| Byes allocated: %lu\n"
"| Minimal allocation: %lu\n"
"| Maximal allocation: %lu\n"
"|-----------------------\n\n\n",
DeviceMemoryPoolTracking.NumAllocations,
DeviceMemoryPoolTracking.AllocationTotal,
DeviceMemoryPoolTracking.AllocationMin,
DeviceMemoryPoolTracking.AllocationMax);
}

// Delete the memory manager before deinitializing the device. Otherwise,
// we may delete device allocations after the device is deinitialized.
if (MemoryManager)
Expand Down Expand Up @@ -897,18 +868,6 @@ Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
// Add the image to list.
LoadedImages.push_back(Image);

// Setup the global device memory pool if needed.
if (!Plugin.getRecordReplay().isReplaying() &&
shouldSetupDeviceMemoryPool()) {
uint64_t HeapSize;
auto SizeOrErr = getDeviceHeapSize(HeapSize);
if (SizeOrErr) {
REPORT("No global device memory pool due to error: %s\n",
toString(std::move(SizeOrErr)).data());
} else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize))
return std::move(Err);
}

if (auto Err = setupRPCServer(Plugin, *Image))
return std::move(Err);

Expand All @@ -932,51 +891,6 @@ Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
return Image;
}

Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
DeviceImageTy &Image,
uint64_t PoolSize) {
// Free the old pool, if any.
if (DeviceMemoryPool.Ptr) {
if (auto Err = dataDelete(DeviceMemoryPool.Ptr,
TargetAllocTy::TARGET_ALLOC_DEVICE))
return Err;
}

DeviceMemoryPool.Size = PoolSize;
auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr,
TargetAllocTy::TARGET_ALLOC_DEVICE);
if (AllocOrErr) {
DeviceMemoryPool.Ptr = *AllocOrErr;
} else {
auto Err = AllocOrErr.takeError();
REPORT("Failure to allocate device memory for global memory pool: %s\n",
toString(std::move(Err)).data());
DeviceMemoryPool.Ptr = nullptr;
DeviceMemoryPool.Size = 0;
}

// Create the metainfo of the device environment global.
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
if (!GHandler.isSymbolInImage(*this, Image,
"__omp_rtl_device_memory_pool_tracker")) {
DP("Skip the memory pool as there is no tracker symbol in the image.");
return Error::success();
}

GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
sizeof(DeviceMemoryPoolTrackingTy),
&DeviceMemoryPoolTracking);
if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
return Err;

// Create the metainfo of the device environment global.
GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool",
sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool);

// Write device environment values to the device.
return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal);
}

Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
DeviceImageTy &Image) {
// The plugin either does not need an RPC server or it is unavailable.
Expand Down
5 changes: 0 additions & 5 deletions offload/plugins-nextgen/cuda/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1228,11 +1228,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Info;
}

virtual bool shouldSetupDeviceMemoryPool() const override {
/// We use the CUDA malloc for now.
return false;
}

/// Getters and setters for stack and heap sizes.
Error getDeviceStackSize(uint64_t &Value) override {
return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
Expand Down
8 changes: 0 additions & 8 deletions offload/plugins-nextgen/host/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -380,9 +380,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
return Info;
}

/// This plugin should not setup the device environment or memory pool.
virtual bool shouldSetupDeviceMemoryPool() const override { return false; };

/// Getters and setters for stack size and heap size not relevant.
Error getDeviceStackSize(uint64_t &Value) override {
Value = 0;
Expand All @@ -391,11 +388,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
Error setDeviceStackSize(uint64_t Value) override {
return Plugin::success();
}
Error getDeviceHeapSize(uint64_t &Value) override {
Value = 0;
return Plugin::success();
}
Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }

private:
/// Grid values for Generic ELF64 plugins.
Expand Down
2 changes: 2 additions & 0 deletions offload/test/mapping/lambda_mapping.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
// RUN: %libomptarget-compileoptxx-run-and-check-generic

// REQUIRES: libc

#include <iostream>

template <typename LOOP_BODY>
Expand Down
2 changes: 1 addition & 1 deletion offload/test/offloading/malloc.c
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ int main() {
int Threads = 64;
int Teams = 10;

// Allocate ~55MB on the device.
// Allocate ~160 KiB on the device.
#pragma omp target map(from : DP)
DP = (long unsigned *)malloc(sizeof(long unsigned) * N * Threads * Teams);

Expand Down
6 changes: 0 additions & 6 deletions openmp/device/include/Allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,18 +14,12 @@

#include "DeviceTypes.h"

// Forward declaration.
struct KernelEnvironmentTy;

namespace ompx {

namespace allocator {

static uint64_t constexpr ALIGNMENT = 16;

/// Initialize the allocator according to \p KernelEnvironment
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);

/// Allocate \p Size bytes.
[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void *
alloc(uint64_t Size);
Expand Down
67 changes: 34 additions & 33 deletions openmp/device/src/Allocator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,42 +18,36 @@
#include "Synchronization.h"

using namespace ompx;
using namespace allocator;

// Provide a default implementation of malloc / free for AMDGPU platforms built
// without 'libc' support.
extern "C" {
#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
#else
[[gnu::leaf]] void *malloc(size_t Size);
[[gnu::leaf]] void free(void *Ptr);
#endif
}

[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility(
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
__omp_rtl_device_memory_pool_tracker;
static constexpr uint64_t MEMORY_SIZE = /* 1 MiB */ 1024 * 1024;
alignas(ALIGNMENT) static uint8_t Memory[MEMORY_SIZE] = {0};

/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool
/// directly.
// Fallback bump pointer interface for platforms without a functioning
// allocator.
struct BumpAllocatorTy final {
uint64_t Offset = 0;

void *alloc(uint64_t Size) {
Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));

if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) {
atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1,
atomic::seq_cst);
atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size,
atomic::seq_cst);
atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size,
atomic::seq_cst);
atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size,
atomic::seq_cst);
}

uint64_t *Data =
reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr);
uint64_t End =
reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size;

uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst);
if (OldData + Size > End)
uint64_t OldData = atomic::add(&Offset, Size, atomic::seq_cst);
if (OldData + Size >= MEMORY_SIZE)
__builtin_trap();

return reinterpret_cast<void *>(OldData);
return &Memory[OldData];
}

void free(void *) {}
Expand All @@ -65,13 +59,20 @@ BumpAllocatorTy BumpAllocator;
///
///{

void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
// TODO: Check KernelEnvironment for an allocator choice as soon as we have
// more than one.
void *allocator::alloc(uint64_t Size) {
#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
return BumpAllocator.alloc(Size);
#else
return ::malloc(Size);
#endif
}

void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }

void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
void allocator::free(void *Ptr) {
#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
BumpAllocator.free(Ptr);
#else
::free(Ptr);
#endif
}

///}
1 change: 0 additions & 1 deletion openmp/device/src/Kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
synchronize::init(IsSPMD);
mapping::init(IsSPMD);
state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment);
allocator::init(IsSPMD, KernelEnvironment);
workshare::init(IsSPMD);
}

Expand Down
Loading
Loading