Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions offload/include/Shared/APITypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,9 @@ struct KernelArgsTy {
struct {
uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
uint64_t Unused : 62;
} Flags = {0, 0, 0};
uint64_t DynCGroupMemFallback : 2; // The fallback for dynamic cgroup mem.
uint64_t Unused : 60;
} Flags = {0, 0, 0, 0};
// The number of teams (for x,y,z dimension).
uint32_t NumTeams[3] = {0, 0, 0};
// The number of threads (for x,y,z dimension).
Expand Down
17 changes: 16 additions & 1 deletion offload/include/Shared/Environment.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,10 +92,25 @@ struct KernelEnvironmentTy {
DynamicEnvironmentTy *DynamicEnv = nullptr;
};

/// The fallback types for the dynamic cgroup memory.
enum class DynCGroupMemFallbackType : unsigned char {
  /// None. Used for indicating that no fallback was triggered.
  None = 0,
  /// Abort the execution. Deliberately shares its value with None.
  Abort = None,
  /// Return null pointer.
  Null = 1,
  /// Allocate from an implementation-defined memory space.
  DefaultMem = 2
};

/// Per-launch environment record. Every member has a default initializer
/// (null pointers, zero counters/sizes, and the `None` fallback kind), so a
/// default-constructed object is fully initialized.
///
/// Each member is declared exactly once. The two pointers come first,
/// followed by the 32-bit fields and finally the one-byte fallback kind, so
/// the members are ordered by decreasing alignment.
struct KernelLaunchEnvironmentTy {
  void *ReductionBuffer = nullptr;
  // NOTE(review): presumably the pointer handed out when the dynamic cgroup
  // memory request takes a fallback path -- confirm against the launch code.
  void *DynCGroupMemFbPtr = nullptr;
  uint32_t ReductionCnt = 0;
  uint32_t ReductionIterCnt = 0;
  uint32_t DynCGroupMemSize = 0;
  DynCGroupMemFallbackType DynCGroupMemFb = DynCGroupMemFallbackType::None;
};

#endif // OMPTARGET_SHARED_ENVIRONMENT_H
3 changes: 3 additions & 0 deletions offload/include/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,9 @@ struct DeviceTy {
/// Indicate that there are pending images for this device or not.
void setHasPendingImages(bool V) { HasPendingImages = V; }

/// Get the maximum shared memory per team for any kernel.
uint64_t getMaxSharedTeamMemory();

private:
/// Deinitialize the device (and plugin).
void deinit();
Expand Down
14 changes: 13 additions & 1 deletion offload/include/omptarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -105,7 +105,7 @@ enum TargetAllocTy : int32_t {

// Default kernel arguments (presumably used for global constructor/destructor
// kernel launches, judging by the name -- confirm against callers).
// The first braced sub-aggregate carries four values, matching the
// `Flags = {0, 0, 0, 0}` default of KernelArgsTy; the initializer list is
// terminated exactly once.
inline KernelArgsTy CTorDTorKernelArgs = {
    1,       0, nullptr,      nullptr,   nullptr,   nullptr, nullptr,
    nullptr, 0, {0, 0, 0, 0}, {1, 0, 0}, {1, 0, 0}, 0};

struct DeviceTy;

Expand Down Expand Up @@ -271,10 +271,22 @@ struct __tgt_target_non_contig {
extern "C" {
#endif

/// The OpenMP access group type. The criterion for grouping tasks using a
/// specific grouping property.
enum omp_access_t {
/// Groups the tasks based on the contention group to which they belong.
omp_access_cgroup = 0,
/// Groups the tasks based on the parallel region to which they bind.
omp_access_pteam = 1,
};

void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
size_t
omp_get_groupprivate_limit(int device_num,
omp_access_t access_group = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
Expand Down
14 changes: 14 additions & 0 deletions offload/libomptarget/OpenMP/API.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,20 @@ EXTERN int omp_get_initial_device(void) {
return HostDevice;
}

/// Return the groupprivate memory limit reported for device \p DeviceNum.
///
/// The initial device always yields zero. For any other device number the
/// device is looked up through the plugin manager; a failed lookup is
/// reported through FATAL_MESSAGE. \p AccessGroup is accepted but not
/// consulted by this implementation.
EXTERN size_t omp_get_groupprivate_limit(int DeviceNum,
                                         omp_access_t AccessGroup) {
  TIMESCOPE();
  OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));

  // Requests that target the initial device are answered with zero.
  const bool IsInitialDevice = DeviceNum == omp_get_initial_device();
  if (IsInitialDevice)
    return 0;

  // Resolve the device; a lookup failure is reported with its error text.
  auto TargetDevice = PM->getDevice(DeviceNum);
  if (!TargetDevice)
    FATAL_MESSAGE(DeviceNum, "%s", toString(TargetDevice.takeError()).c_str());

  const size_t Limit = TargetDevice->getMaxSharedTeamMemory();
  return Limit;
}

EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
";size=" + std::to_string(Size));
Expand Down
6 changes: 6 additions & 0 deletions offload/libomptarget/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -371,3 +371,9 @@ bool DeviceTy::useAutoZeroCopy() {
bool DeviceTy::isAccessiblePtr(const void *Ptr, size_t Size) {
return RTL->is_accessible_ptr(RTLDeviceID, Ptr, Size);
}

uint64_t DeviceTy::getMaxSharedTeamMemory() {
using DeviceQueryKind = llvm::omp::target::plugin::DeviceQueryKind;
return RTL->query_device_info(
RTLDeviceID, DeviceQueryKind::DEVICE_QUERY_MAX_SHARED_TEAM_MEM);
}
1 change: 1 addition & 0 deletions offload/libomptarget/exports
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ VERS1.0 {
omp_get_num_devices;
omp_get_device_num;
omp_get_initial_device;
omp_get_groupprivate_limit;
omp_target_alloc;
omp_target_free;
omp_target_is_accessible;
Expand Down
1 change: 1 addition & 0 deletions offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ typedef enum {
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6,
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7,
HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15,
HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16,
} hsa_amd_memory_pool_info_t;

typedef enum {
Expand Down
38 changes: 26 additions & 12 deletions offload/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -277,7 +277,6 @@ struct AMDGPUMemoryPoolTy {

if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
return Err;

return Plugin::success();
}

Expand Down Expand Up @@ -548,6 +547,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
return Err;
}

StaticBlockMemSize = GroupSize;

// Make sure it is a kernel symbol.
if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
return Plugin::error(ErrorCode::INVALID_BINARY,
Expand All @@ -571,8 +572,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {

/// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;

/// Return maximum block size for maximum occupancy
Expand Down Expand Up @@ -2172,6 +2173,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = checkIfAPU())
return Err;

// Retrieve the size of the group memory.
for (const auto *Pool : AllMemoryPools) {
if (Pool->isGroup()) {
size_t Size = 0;
if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, Size))
return Err;
MaxBlockSharedMemSize = Size;
break;
}
}

// Supports block shared memory natively.
HasNativeBlockSharedMem = true;

return Plugin::success();
}

Expand Down Expand Up @@ -3177,7 +3192,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
KernelArgsTy KernelArgs = {};
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = AMDGPUKernel.launchImpl(
*this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
*this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;

Expand Down Expand Up @@ -3712,6 +3727,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {

Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
Expand All @@ -3724,13 +3740,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
return Err;

// Account for user requested dynamic shared memory.
uint32_t GroupSize = getGroupSize();
if (uint32_t MaxDynCGroupMem = std::max(
KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
GroupSize += MaxDynCGroupMem;
}

uint64_t StackSize;
if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
return Err;
Expand Down Expand Up @@ -3782,9 +3791,14 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
KernelArgs.DynCGroupMem);
}

// Increase to the requested dynamic memory size for the device if needed.
DynBlockMemSize =
std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());

// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
GroupSize, StackSize, ArgsMemoryManager);
getStaticBlockMemSize() + DynBlockMemSize,
StackSize, ArgsMemoryManager);
}

Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
Expand Down
34 changes: 30 additions & 4 deletions offload/plugins-nextgen/common/include/PluginInterface.h
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,10 @@ struct InfoTreeNode {
}
};

/// Kinds of device information that can be requested through
/// GenericPluginTy::query_device_info.
enum class DeviceQueryKind {
/// Used by DeviceTy::getMaxSharedTeamMemory to obtain the maximum shared
/// memory per team.
DEVICE_QUERY_MAX_SHARED_TEAM_MEM = 0,
};

/// Class wrapping a __tgt_device_image and its offload entry table on a
/// specific device. This class is responsible for storing and managing
/// the offload entries for an image on a device.
Expand Down Expand Up @@ -361,7 +365,7 @@ struct GenericKernelTy {
AsyncInfoWrapperTy &AsyncInfoWrapper) const;
virtual Error launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
KernelArgsTy &KernelArgs,
uint32_t DynBlockMemSize, KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const = 0;

Expand All @@ -371,6 +375,9 @@ struct GenericKernelTy {
/// Get the kernel name.
const char *getName() const { return Name.c_str(); }

/// Get the size of the static per-block memory consumed by the kernel.
/// The backing member defaults to zero; the AMDGPU plugin assigns it the
/// kernel's group size.
uint32_t getStaticBlockMemSize() const { return StaticBlockMemSize; }

/// Get the kernel image.
DeviceImageTy &getImage() const {
assert(ImagePtr && "Kernel is not initialized!");
Expand All @@ -383,9 +390,10 @@ struct GenericKernelTy {
}

/// Return a device pointer to a new kernel launch environment.
Expected<KernelLaunchEnvironmentTy *>
getKernelLaunchEnvironment(GenericDeviceTy &GenericDevice, uint32_t Version,
AsyncInfoWrapperTy &AsyncInfo) const;
Expected<KernelLaunchEnvironmentTy *> getKernelLaunchEnvironment(
GenericDeviceTy &GenericDevice, const KernelArgsTy &KernelArgs,
uint32_t BlockMemSize, DynCGroupMemFallbackType DynBlockMemFb,
void *DynBlockMemFbPtr, AsyncInfoWrapperTy &AsyncInfoWrapper) const;

/// Indicate whether an execution mode is valid.
static bool isValidExecutionMode(OMPTgtExecModeFlags ExecutionMode) {
Expand Down Expand Up @@ -485,6 +493,9 @@ struct GenericKernelTy {
/// The maximum number of threads which the kernel could leverage.
uint32_t MaxNumThreads;

/// The size of the static memory used per block.
uint32_t StaticBlockMemSize = 0;

/// The kernel environment, including execution flags.
KernelEnvironmentTy KernelEnvironment;

Expand Down Expand Up @@ -791,6 +802,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
/// this id is not unique between different plugins; they may overlap.
int32_t getDeviceId() const { return DeviceId; }

/// Get the total shared memory per block that can be used in any kernel.
/// The backing member defaults to zero; the AMDGPU plugin assigns it from the
/// size of its group memory pool.
uint32_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; }

/// Indicate whether the device has native block shared memory.
/// The backing member defaults to false; the AMDGPU plugin sets it to true.
bool hasNativeBlockSharedMem() const { return HasNativeBlockSharedMem; }

/// Set the context of the device if needed, before calling device-specific
/// functions. Plugins may implement this function as a no-op if not needed.
virtual Error setContext() = 0;
Expand Down Expand Up @@ -1238,6 +1255,12 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
std::atomic<bool> OmptInitialized;
#endif

/// The total per-block shared memory that a kernel may use.
uint32_t MaxBlockSharedMemSize = 0;

/// Whether the device has native block shared memory.
bool HasNativeBlockSharedMem = false;

private:
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
Expand Down Expand Up @@ -1471,6 +1494,9 @@ struct GenericPluginTy {
/// Prints information about the given devices supported by the plugin.
void print_device_info(int32_t DeviceId);

/// Retrieve information about the given device.
int64_t query_device_info(int32_t DeviceId, DeviceQueryKind Query);

/// Creates an event in the given plugin if supported.
int32_t create_event(int32_t DeviceId, void **EventPtr);

Expand Down
Loading
Loading