Skip to content

[OpenMP][Offload] Add offload runtime support for dyn_groupprivate clause #152831

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: users/kevinsala/omp-dyn-groupprivate-codegen-pr
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions offload/DeviceRTL/include/DeviceTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,4 +163,13 @@ typedef enum omp_allocator_handle_t {

///}

/// The OpenMP access group type. The criterion for grouping tasks using a
/// specific grouping property.
enum omp_access_t {
/// Groups the tasks based on the contention group to which they belong.
omp_access_cgroup = 0,
/// Groups the tasks based on the parallel region to which they bind.
omp_access_pteam = 1,
};

#endif
2 changes: 1 addition & 1 deletion offload/DeviceRTL/include/Interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -222,7 +222,7 @@ struct KernelEnvironmentTy;
int8_t __kmpc_is_spmd_exec_mode();

int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
KernelLaunchEnvironmentTy *KernelLaunchEnvironment);

void __kmpc_target_deinit();

Expand Down
2 changes: 1 addition & 1 deletion offload/DeviceRTL/include/State.h
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ extern Local<ThreadStateTy **> ThreadStates;

/// Initialize the state machinery. Must be called by all threads.
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment);
KernelLaunchEnvironmentTy *KernelLaunchEnvironment);

/// Return the kernel and kernel launch environment associated with the current
/// kernel. The former is static and contains compile time information that
Expand Down
14 changes: 7 additions & 7 deletions offload/DeviceRTL/src/Kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,8 @@ enum OMPTgtExecModeFlags : unsigned char {
};

static void
inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
initializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
// Order is important here.
synchronize::init(IsSPMD);
mapping::init(IsSPMD);
Expand Down Expand Up @@ -80,17 +80,17 @@ extern "C" {
/// \param Ident Source location identification, can be NULL.
///
int32_t __kmpc_target_init(KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
KernelLaunchEnvironmentTy *KernelLaunchEnvironment) {
ConfigurationEnvironmentTy &Configuration = KernelEnvironment.Configuration;
bool IsSPMD = Configuration.ExecMode & OMP_TGT_EXEC_MODE_SPMD;
bool UseGenericStateMachine = Configuration.UseGenericStateMachine;
if (IsSPMD) {
inititializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
KernelLaunchEnvironment);
initializeRuntime(/*IsSPMD=*/true, KernelEnvironment,
KernelLaunchEnvironment);
synchronize::threadsAligned(atomic::relaxed);
} else {
inititializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
KernelLaunchEnvironment);
initializeRuntime(/*IsSPMD=*/false, KernelEnvironment,
KernelLaunchEnvironment);
// No need to wait since only the main threads will execute user
// code and workers will run into a barrier right away.
}
Expand Down
49 changes: 47 additions & 2 deletions offload/DeviceRTL/src/State.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,35 @@ void SharedMemorySmartStackTy::pop(void *Ptr, uint64_t Bytes) {
memory::freeGlobal(Ptr, "Slow path shared memory deallocation");
}

struct DynCGroupMemTy {
void init(KernelLaunchEnvironmentTy *KLE, void *NativeDynCGroup) {
Size = 0;
Ptr = nullptr;
IsFallback = false;
Comment on lines +163 to +165
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Move to field initializers?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

With the clang::loader_uninitialized attribute in the DynCGroupMem variable, I can't use field initializers or a constructor.

if (!KLE)
return;

Size = KLE->DynCGroupMemSize;
if (void *Fallback = KLE->DynCGroupMemFallback) {
Ptr = static_cast<char *>(Fallback) + Size * omp_get_team_num();
IsFallback = true;
} else {
Ptr = static_cast<char *>(NativeDynCGroup);
}
}

char *getPtr(size_t Offset) const { return Ptr + Offset; }
bool isFallback() const { return IsFallback; }
size_t getSize() const { return Size; }

private:
char *Ptr;
size_t Size;
bool IsFallback;
};

[[clang::loader_uninitialized]] static Local<DynCGroupMemTy> DynCGroupMem;

} // namespace

void *memory::getDynamicBuffer() { return DynamicSharedBuffer; }
Expand Down Expand Up @@ -246,13 +275,18 @@ int returnValIfLevelIsActive(int Level, int Val, int DefaultVal,
} // namespace

void state::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
KernelLaunchEnvironmentTy &KernelLaunchEnvironment) {
KernelLaunchEnvironmentTy *KLE) {
SharedMemorySmartStack.init(IsSPMD);

if (KLE == reinterpret_cast<KernelLaunchEnvironmentTy *>(~0))
KLE = nullptr;

if (mapping::isInitialThreadInLevel0(IsSPMD)) {
DynCGroupMem.init(KLE, DynamicSharedBuffer);
TeamState.init(IsSPMD);
ThreadStates = nullptr;
KernelEnvironmentPtr = &KernelEnvironment;
KernelLaunchEnvironmentPtr = &KernelLaunchEnvironment;
KernelLaunchEnvironmentPtr = KLE;
}
}

Expand Down Expand Up @@ -430,6 +464,17 @@ int omp_get_team_num() { return mapping::getBlockIdInKernel(); }
int omp_get_initial_device(void) { return -1; }

int omp_is_initial_device(void) { return 0; }

/// Return a pointer into the dynamic groupprivate memory tracked by
/// DynCGroupMem.
///
/// \param Offset Offset added to the base pointer. It is forwarded as-is and
///        is not compared against the recorded size here.
/// \param IsFallback Optional output. When non-null it receives the fallback
///        flag of DynCGroupMem; when null it is skipped.
///
/// The unnamed omp_access_t argument is accepted but not used.
void *omp_get_dyn_groupprivate_ptr(size_t Offset, int *IsFallback,
                                   omp_access_t) {
  // Only report the fallback state if the caller asked for it.
  if (IsFallback)
    *IsFallback = static_cast<int>(DynCGroupMem.isFallback());

  return static_cast<void *>(DynCGroupMem.getPtr(Offset));
}

/// Return the size recorded in DynCGroupMem for the dynamic groupprivate
/// memory. The unnamed omp_access_t argument is accepted but not used.
size_t omp_get_dyn_groupprivate_size(omp_access_t) {
  const size_t RecordedSize = DynCGroupMem.getSize();
  return RecordedSize;
}
}

extern "C" {
Expand Down
6 changes: 4 additions & 2 deletions offload/include/Shared/APITypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,10 @@ struct KernelArgsTy {
struct {
uint64_t NoWait : 1; // Was this kernel spawned with a `nowait` clause.
uint64_t IsCUDA : 1; // Was this kernel spawned via CUDA.
uint64_t Unused : 62;
} Flags = {0, 0, 0};
uint64_t AllowDynCGroupMemFallback : 1; // Allow fallback for dynamic cgroup
// memory.
uint64_t Unused : 61;
} Flags = {0, 0, 0, 0};
// The number of teams (for x,y,z dimension).
uint32_t NumTeams[3] = {0, 0, 0};
// The number of threads (for x,y,z dimension).
Expand Down
4 changes: 3 additions & 1 deletion offload/include/Shared/Environment.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,9 +93,11 @@ struct KernelEnvironmentTy {
};

struct KernelLaunchEnvironmentTy {
void *ReductionBuffer = nullptr;
void *DynCGroupMemFallback = nullptr;
uint32_t ReductionCnt = 0;
uint32_t ReductionIterCnt = 0;
void *ReductionBuffer = nullptr;
uint32_t DynCGroupMemSize = 0;
};

#endif // OMPTARGET_SHARED_ENVIRONMENT_H
3 changes: 3 additions & 0 deletions offload/include/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@ struct DeviceTy {
/// Indicate that there are pending images for this device or not.
void setHasPendingImages(bool V) { HasPendingImages = V; }

/// Get the maximum shared memory per team for any kernel.
uint64_t getMaxSharedTeamMemory();

private:
/// Deinitialize the device (and plugin).
void deinit();
Expand Down
14 changes: 13 additions & 1 deletion offload/include/omptarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ enum TargetAllocTy : int32_t {

inline KernelArgsTy CTorDTorKernelArgs = {1, 0, nullptr, nullptr,
nullptr, nullptr, nullptr, nullptr,
0, {0,0,0}, {1, 0, 0}, {1, 0, 0}, 0};
0, {0,0,0,0}, {1, 0, 0}, {1, 0, 0}, 0};

struct DeviceTy;

Expand Down Expand Up @@ -273,10 +273,22 @@ struct __tgt_target_non_contig {
extern "C" {
#endif

/// The OpenMP access group type. The criterion for grouping tasks using a
/// specific grouping property.
enum omp_access_t {
/// Groups the tasks based on the contention group to which they belong.
omp_access_cgroup = 0,
/// Groups the tasks based on the parallel region to which they bind.
omp_access_pteam = 1,
};

void ompx_dump_mapping_tables(void);
int omp_get_num_devices(void);
int omp_get_device_num(void);
int omp_get_initial_device(void);
size_t
omp_get_groupprivate_limit(int device_num,
omp_access_t access_group = omp_access_cgroup);
void *omp_target_alloc(size_t Size, int DeviceNum);
void omp_target_free(void *DevicePtr, int DeviceNum);
int omp_target_is_present(const void *Ptr, int DeviceNum);
Expand Down
14 changes: 14 additions & 0 deletions offload/libomptarget/OpenMP/API.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,20 @@ EXTERN int omp_get_initial_device(void) {
return HostDevice;
}

/// Return the groupprivate memory limit of device \p DeviceNum.
///
/// The value is whatever DeviceTy::getMaxSharedTeamMemory() reports for the
/// device. For the initial device (as returned by omp_get_initial_device())
/// the result is always 0.
///
/// \param DeviceNum Device number to query.
/// \param AccessGroup Not referenced in the body; the same limit is returned
///        regardless of its value.
EXTERN size_t omp_get_groupprivate_limit(int DeviceNum,
omp_access_t AccessGroup) {
TIMESCOPE();
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
// The initial device is answered directly, without a device lookup.
if (DeviceNum == omp_get_initial_device())
return 0;

// A failed device lookup is reported through FATAL_MESSAGE with the error
// text.
auto DeviceOrErr = PM->getDevice(DeviceNum);
if (!DeviceOrErr)
FATAL_MESSAGE(DeviceNum, "%s", toString(DeviceOrErr.takeError()).c_str());

return DeviceOrErr->getMaxSharedTeamMemory();
}

EXTERN void *omp_target_alloc(size_t Size, int DeviceNum) {
TIMESCOPE_WITH_DETAILS("dst_dev=" + std::to_string(DeviceNum) +
";size=" + std::to_string(Size));
Expand Down
6 changes: 6 additions & 0 deletions offload/libomptarget/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -281,3 +281,9 @@ bool DeviceTy::useAutoZeroCopy() {
return false;
return RTL->use_auto_zero_copy(RTLDeviceID);
}

uint64_t DeviceTy::getMaxSharedTeamMemory() {
using DeviceQueryKind = llvm::omp::target::plugin::DeviceQueryKind;
return RTL->query_device_info(
RTLDeviceID, DeviceQueryKind::DEVICE_QUERY_MAX_SHARED_TEAM_MEM);
}
1 change: 1 addition & 0 deletions offload/libomptarget/exports
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ VERS1.0 {
omp_get_num_devices;
omp_get_device_num;
omp_get_initial_device;
omp_get_groupprivate_limit;
omp_target_alloc;
omp_target_free;
omp_target_is_present;
Expand Down
1 change: 1 addition & 0 deletions offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ typedef enum {
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_GRANULE = 6,
HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALIGNMENT = 7,
HSA_AMD_MEMORY_POOL_INFO_ACCESSIBLE_BY_ALL = 15,
HSA_AMD_MEMORY_POOL_INFO_ALLOC_MAX_SIZE = 16,
} hsa_amd_memory_pool_info_t;

typedef enum {
Expand Down
38 changes: 26 additions & 12 deletions offload/plugins-nextgen/amdgpu/src/rtl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -273,7 +273,6 @@ struct AMDGPUMemoryPoolTy {

if (auto Err = getAttr(HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, GlobalFlags))
return Err;

return Plugin::success();
}

Expand Down Expand Up @@ -543,6 +542,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {
return Err;
}

StaticBlockMemSize = GroupSize;

// Make sure it is a kernel symbol.
if (SymbolType != HSA_SYMBOL_KIND_KERNEL)
return Plugin::error(ErrorCode::INVALID_BINARY,
Expand All @@ -566,8 +567,8 @@ struct AMDGPUKernelTy : public GenericKernelTy {

/// Launch the AMDGPU kernel function.
Error launchImpl(GenericDeviceTy &GenericDevice, uint32_t NumThreads[3],
uint32_t NumBlocks[3], KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
uint32_t NumBlocks[3], uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs, KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const override;

/// Print more elaborate kernel launch info for AMDGPU
Expand Down Expand Up @@ -2020,6 +2021,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
if (auto Err = checkIfAPU())
return Err;

// Retrieve the size of the group memory.
for (const auto *Pool : AllMemoryPools) {
if (Pool->isGroup()) {
size_t Size = 0;
if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE, Size))
return Err;
MaxBlockSharedMemSize = Size;
break;
}
}

// Supports block shared memory natively.
HasNativeBlockSharedMem = true;

return Plugin::success();
}

Expand Down Expand Up @@ -2856,7 +2871,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
KernelArgsTy KernelArgs = {};
uint32_t NumBlocksAndThreads[3] = {1u, 1u, 1u};
if (auto Err = AMDGPUKernel.launchImpl(
*this, NumBlocksAndThreads, NumBlocksAndThreads, KernelArgs,
*this, NumBlocksAndThreads, NumBlocksAndThreads, 0, KernelArgs,
KernelLaunchParamsTy{}, AsyncInfoWrapper))
return Err;

Expand Down Expand Up @@ -3357,6 +3372,7 @@ struct AMDGPUPluginTy final : public GenericPluginTy {

Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t NumThreads[3], uint32_t NumBlocks[3],
uint32_t DynBlockMemSize,
KernelArgsTy &KernelArgs,
KernelLaunchParamsTy LaunchParams,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
Expand All @@ -3374,13 +3390,6 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
if (auto Err = ArgsMemoryManager.allocate(ArgsSize, &AllArgs))
return Err;

// Account for user requested dynamic shared memory.
uint32_t GroupSize = getGroupSize();
if (uint32_t MaxDynCGroupMem = std::max(
KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize())) {
GroupSize += MaxDynCGroupMem;
}

uint64_t StackSize;
if (auto Err = GenericDevice.getDeviceStackSize(StackSize))
return Err;
Expand Down Expand Up @@ -3432,9 +3441,14 @@ Error AMDGPUKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
KernelArgs.DynCGroupMem);
}

// Increase to the requested dynamic memory size for the device if needed.
DynBlockMemSize =
std::max(DynBlockMemSize, GenericDevice.getDynamicMemorySize());

// Push the kernel launch into the stream.
return Stream->pushKernelLaunch(*this, AllArgs, NumThreads, NumBlocks,
GroupSize, StackSize, ArgsMemoryManager);
getStaticBlockMemSize() + DynBlockMemSize,
StackSize, ArgsMemoryManager);
}

Error AMDGPUKernelTy::printLaunchInfoDetails(GenericDeviceTy &GenericDevice,
Expand Down
Loading
Loading