Skip to content

Commit 69fb8c0

Browse files
committed
[AMDGPU][Offload] GFX90A coarse grain shared alloc option
1 parent d2546bf commit 69fb8c0

File tree

3 files changed

+19
-12
lines changed

3 files changed

+19
-12
lines changed

offload/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2924,6 +2924,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
29242924
OMPX_ApuMaps("OMPX_APU_MAPS", false),
29252925
OMPX_EnableGFX90ACoarseGrainUsmMaps(
29262926
"OMPX_ENABLE_GFX90A_COARSE_GRAIN_USM_MAPS", false),
2927+
OMPX_EnableGFX90ACoarseGrainSharedAlloc(
2928+
"OMPX_ENABLE_GFX90A_COARSE_GRAIN_SHARED_ALLOC", false),
29272929
OMPX_StrictSanityChecks("OMPX_STRICT_SANITY_CHECKS", false),
29282930
OMPX_SyncCopyBack("LIBOMPTARGET_SYNC_COPY_BACK", true),
29292931
OMPX_APUPrefaultMemcopy("LIBOMPTARGET_APU_PREFAULT_MEMCOPY", "true"),
@@ -4339,6 +4341,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
43394341
/// - Coarse graining upon USM map on MI200 needs to be enabled.
43404342
void specialBehaviorHandling() {
43414343
// Copy the value of the OMPX_ENABLE_GFX90A_COARSE_GRAIN_USM_MAPS envar into
// the plain bool member of the same purpose.
EnableGFX90ACoarseGrainUsmMaps = OMPX_EnableGFX90ACoarseGrainUsmMaps;
4344+
// Likewise copy OMPX_ENABLE_GFX90A_COARSE_GRAIN_SHARED_ALLOC; allocate()
// consults this flag when picking the pool for TARGET_ALLOC_SHARED requests.
EnableGFX90ACoarseGrainSharedAlloc =
4345+
OMPX_EnableGFX90ACoarseGrainSharedAlloc;
43424346
}
43434347

43444348
bool IsGfx90aCoarseGrainUsmMapEnabledImpl() override final {
@@ -4460,6 +4464,12 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
44604464
/// OMPX_DISABLE_USM_MAPS
44614465
BoolEnvar OMPX_EnableGFX90ACoarseGrainUsmMaps;
44624466

4467+
/// Value of OMPX_ENABLE_GFX90A_COARSE_GRAIN_SHARED_ALLOC.
4468+
/// Use on MI200 systems to enable coarse grain
4469+
/// allocation of TARGET_ALLOC_SHARED memory.
4470+
/// Default is fine grain allocation.
4471+
BoolEnvar OMPX_EnableGFX90ACoarseGrainSharedAlloc;
4472+
44634473
/// Makes warnings turn into fatal errors
44644474
BoolEnvar OMPX_StrictSanityChecks;
44654475

@@ -4549,6 +4559,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
45494559
// residing on the page goes through implicit/explicit OpenMP map.
45504560
bool EnableGFX90ACoarseGrainUsmMaps = false;
45514561

4562+
// Set by OMPX_ENABLE_GFX90A_COARSE_GRAIN_SHARED_ALLOC environment variable.
4563+
// If set, TARGET_ALLOC_SHARED memory is taken from the coarse-grained memory
// pool on MI200 (gfx90a) devices instead of the pool selected by default.
4564+
bool EnableGFX90ACoarseGrainSharedAlloc = false;
4565+
45524566
/// True if in multi-device mode.
45534567
bool IsMultiDeviceEnabled = false;
45544568

@@ -5246,6 +5260,11 @@ void *AMDGPUDeviceTy::allocate(size_t Size, void *, TargetAllocTy Kind) {
52465260
break;
52475261
}
52485262

5263+
if (Kind == TARGET_ALLOC_SHARED && IsEquippedWithGFX90A &&
5264+
EnableGFX90ACoarseGrainSharedAlloc) {
5265+
MemoryPool = CoarseGrainedMemoryPools[0];
5266+
}
5267+
52495268
if (!MemoryPool) {
52505269
REPORT("No memory pool for the specified allocation kind\n");
52515270
return nullptr;

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1051,9 +1051,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
10511051
return OMPX_MinThreadsForLowTripCount;
10521052
}
10531053

1054-
virtual uint32_t getUseCoarseGrain() {
1055-
return OMPX_EnableCoarseAllocs;
1056-
}
10571054
/// Whether or not to reuse blocks for high trip count loops.
10581055
/// @see OMPX_ReuseBlocksForHighTripCount
10591056
bool getReuseBlocksForHighTripCount() {
@@ -1212,9 +1209,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
12121209
BoolEnvar OMPX_ReuseBlocksForHighTripCount =
12131210
BoolEnvar("LIBOMPTARGET_REUSE_BLOCKS_FOR_HIGH_TRIP_COUNT", true);
12141211

1215-
/// Envar to enable coasre alocs.
1216-
BoolEnvar OMPX_EnableCoarseAllocs;
1217-
12181212
protected:
12191213
/// Environment variables defined by the LLVM OpenMP implementation
12201214
/// regarding the initial number of streams and events.

offload/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -927,7 +927,6 @@ GenericDeviceTy::GenericDeviceTy(GenericPluginTy &Plugin, int32_t DeviceId,
927927
OMPX_InitialNumEvents("LIBOMPTARGET_NUM_INITIAL_EVENTS", 1),
928928
OMPX_NumMultiDevices("LIBOMPTARGET_NUM_MULTI_DEVICES", 0),
929929
OMPX_EnableRuntimeAutotuning("OMPX_ENABLE_RUNTIME_AUTOTUNING", false),
930-
OMPX_EnableCoarseAllocs("OMPX_ENABLE_COARSE_ALLOCS", false),
931930
DeviceId(DeviceId), GridValues(OMPGridValues),
932931
PeerAccesses(NumDevices, PeerAccessState::PENDING), PeerAccessesLock(),
933932
PinnedAllocs(*this), RPCServer(nullptr) {
@@ -2207,11 +2206,6 @@ void *GenericPluginTy::data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr,
22072206
}
22082207
assert(*AllocOrErr && "Null pointer upon successful allocation");
22092208

2210-
// Method has no effect when the CUDA Plugin is used.
2211-
// This method can only be called if HostPtr is not null.
2212-
if (HostPtr && Kind == TARGET_ALLOC_SHARED && getDevice(DeviceId).getUseCoarseGrain() )
2213-
set_coarse_grain_mem_region(DeviceId, HostPtr, Size);
2214-
22152209
return *AllocOrErr;
22162210
}();
22172211
T.res(R);

0 commit comments

Comments
 (0)