Skip to content

Commit 33d4bc1

Browse files
carlobertolli authored and ronlieb committed
[OpenMP][MI300A][libomptarget] Add option to make map constructs prefault pages on target GPU.
Prefaulting pages makes it possible to execute applications built with xnack- in unified_shared_memory mode. A translation for each mapped page is added to the GPU page table, so there is no need to run the XNACK algorithm when a page is referenced by a GPU thread for the first time. Change-Id: Ib3939510ecadc05e8a8f9a109d6f9d28eef3811b
1 parent 995a496 commit 33d4bc1

File tree

11 files changed

+264
-68
lines changed

11 files changed

+264
-68
lines changed

openmp/libomptarget/include/omptargetplugin.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,8 @@ int32_t __tgt_rtl_number_of_devices(void);
3737
// Return if the system is equipped with an APU
3838
bool __tgt_rtl_has_apu_device(void);
3939

40-
// Returns true, if the system is equipped with an GFX90a
41-
bool __tgt_rtl_has_gfx90a_device(void);
40+
// Returns true, if the system is equipped with a dGPU which supports USM.
41+
bool __tgt_rtl_has_USM_capable_dGPU(void);
4242

4343
bool __tgt_rtl_are_allocations_for_maps_on_apus_disabled(void);
4444

openmp/libomptarget/include/rtl.h

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,8 +37,9 @@ struct RTLInfoTy {
3737
typedef int32_t(is_data_exchangable_ty)(int32_t, int32_t);
3838
typedef int32_t(number_of_devices_ty)();
3939
typedef bool(has_apu_device_ty)();
40-
typedef bool(has_gfx90a_device_ty)();
40+
typedef bool(has_USM_capable_dGPU_ty)();
4141
typedef bool(are_allocations_for_maps_on_apus_disabled_ty)();
42+
typedef bool(requested_prepopulate_gpu_page_table_ty)();
4243
typedef bool(is_no_maps_check_ty)();
4344
typedef bool(is_fine_grained_memory_enabled_ty)();
4445
typedef int32_t(init_device_ty)(int32_t);
@@ -74,6 +75,7 @@ struct RTLInfoTy {
7475
typedef int32_t(sync_event_ty)(int32_t, void *);
7576
typedef int32_t(destroy_event_ty)(int32_t, void *);
7677
typedef int(set_coarse_grain_mem_region_ty)(int32_t, void *, int64_t);
78+
typedef int(prepopulate_page_table_ty)(int32_t, void *, int64_t);
7779
typedef int32_t(query_coarse_grain_mem_region_ty)(int32_t, void *, int64_t);
7880
typedef int32_t(enable_access_to_all_agents_ty)(void *, int32_t);
7981
typedef int32_t(release_async_info_ty)(int32_t, __tgt_async_info *);
@@ -107,9 +109,11 @@ struct RTLInfoTy {
107109
is_data_exchangable_ty *is_data_exchangable = nullptr;
108110
number_of_devices_ty *number_of_devices = nullptr;
109111
has_apu_device_ty *has_apu_device = nullptr;
110-
has_gfx90a_device_ty *has_gfx90a_device = nullptr;
112+
has_USM_capable_dGPU_ty *has_USM_capable_dGPU = nullptr;
111113
are_allocations_for_maps_on_apus_disabled_ty
112114
*are_allocations_for_maps_on_apus_disabled = nullptr;
115+
requested_prepopulate_gpu_page_table_ty
116+
*requested_prepopulate_gpu_page_table = nullptr;
113117
is_no_maps_check_ty *is_no_maps_check = nullptr;
114118
is_fine_grained_memory_enabled_ty *is_fine_grained_memory_enabled = nullptr;
115119
init_device_ty *init_device = nullptr;
@@ -145,6 +149,7 @@ struct RTLInfoTy {
145149
data_lock_ty *data_lock = nullptr;
146150
data_unlock_ty *data_unlock = nullptr;
147151
set_coarse_grain_mem_region_ty *set_coarse_grain_mem_region = nullptr;
152+
prepopulate_page_table_ty *prepopulate_page_table = nullptr;
148153
query_coarse_grain_mem_region_ty *query_coarse_grain_mem_region = nullptr;
149154
enable_access_to_all_agents_ty *enable_access_to_all_agents = nullptr;
150155
data_notify_mapped_ty *data_notify_mapped = nullptr;

openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp

Lines changed: 150 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -2377,6 +2377,8 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
23772377
if (Error Err = AMDImage->loadExecutable(*this))
23782378
return std::move(Err);
23792379

2380+
Plugin::get().checkAndAdjustUsmModeForTargetImage(TgtImage);
2381+
23802382
return AMDImage;
23812383
}
23822384

@@ -2682,6 +2684,20 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
26822684
return coarse_grain_mem_tab->contains((const uintptr_t)ptr, size);
26832685
}
26842686

2687+
Error prepopulatePageTableImpl(void *ptr, int64_t size) override final {
2688+
// Ask ROCr to make the [ptr, ptr+size-1] pages accessible in place by this
2689+
// device's agent (HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE).
2690+
hsa_amd_svm_attribute_pair_t tt;
2691+
tt.attribute = HSA_AMD_SVM_ATTRIB_AGENT_ACCESSIBLE_IN_PLACE;
2692+
tt.value = Agent.handle;
2693+
hsa_status_t err = hsa_amd_svm_attributes_set(ptr, size, &tt, 1);
2694+
if (err != HSA_STATUS_SUCCESS) {
2695+
return Plugin::error("Failed to prepopulate GPU page table.");
2696+
}
2697+
2698+
return Plugin::success();
2699+
}
2700+
26852701
/// Create an event.
26862702
Error createEventImpl(void **EventPtrStorage) override {
26872703
AMDGPUEventTy **Event = reinterpret_cast<AMDGPUEventTy **>(EventPtrStorage);
@@ -3419,8 +3435,10 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
34193435
#endif
34203436

34213437
// Initialize flags for device type:
3422-
hasGfx90aDevice();
34233438
hasAPUDevice();
3439+
// check for dGPUs with USM support
3440+
hasGfx90aDevice();
3441+
hasMI300xDevice();
34243442

34253443
readEnvVars();
34263444

@@ -3463,7 +3481,19 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
34633481
#define ALDEBARAN_MAJOR 9
34643482
#define ALDEBARAN_STEPPING 10
34653483

3466-
bool hasGfx90aDevice() override final {
3484+
bool hasMI300xDevice() {
3485+
if (HasMi300xDevice != -1)
3486+
return HasMi300xDevice;
3487+
3488+
if (!Initialized)
3489+
FATAL_MESSAGE(1, "%s", "hasMI300xDevice called on uninitialized plugin");
3490+
// On splinter the MI300X identifies itself as a GFX941. Use GFX name to
3491+
// distinguish for testing.
3492+
HasMi300xDevice = checkForDeviceByGFXName("gfx941");
3493+
return HasMi300xDevice;
3494+
}
3495+
3496+
bool hasGfx90aDevice() {
34673497
if (HasGFX90ADevice != -1)
34683498
return HasGFX90ADevice;
34693499

@@ -3474,10 +3504,18 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
34743504
return HasGFX90ADevice;
34753505
}
34763506

3507+
bool hasDGpuWithUsmSupport() override final {
3508+
return hasGfx90aDevice() || hasMI300xDevice();
3509+
}
3510+
34773511
bool AreAllocationsForMapsOnApusDisabled() override final {
34783512
return DisableAllocationsForMapsOnApus;
34793513
}
34803514

3515+
bool requestedPrepopulateGPUPageTable() override final {
3516+
return PrepopulateGPUPageTable;
3517+
}
3518+
34813519
bool IsNoMapsCheck() override final { return NoUSMMapChecks; }
34823520

34833521
bool IsFineGrainedMemoryEnabled() override final {
@@ -3491,6 +3529,8 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
34913529
NoMapChecks = BoolEnvar("OMPX_DISABLE_MAPS", true);
34923530
DisableUsmMaps = BoolEnvar("OMPX_DISABLE_USM_MAPS", false);
34933531
HsaXnack = BoolEnvar("HSA_XNACK", false);
3532+
APUPrefault = BoolEnvar("OMPX_EAGER_ZERO_COPY_MAPS", false);
3533+
ZeroCopyForMapsOnUsm = BoolEnvar("OMPX_APU_MAPS", false);
34943534
}
34953535

34963536
void setUpEnv() override final {
@@ -3502,6 +3542,15 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
35023542
if (DisableUsmMaps.get() == true) {
35033543
EnableFineGrainedMemory = true;
35043544
}
3545+
3546+
if (hasAPUDevice()) {
3547+
// OMPX_EAGER_ZERO_COPY_MAPS=1 && HSA_XNACK=0 (XNACK-disabled)
3548+
// && default (non-USM) program
3549+
if ((APUPrefault.get() == true) && !IsXnackEnabled() &&
3550+
!(Plugin::get().getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY)) {
3551+
PrepopulateGPUPageTable = true;
3552+
}
3553+
}
35053554
}
35063555

35073556
/// Check whether the image is compatible with an AMDGPU device.
@@ -3545,38 +3594,74 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
35453594
}
35463595

35473596
void checkAndAdjustUsmModeForTargetImage(
3548-
__tgt_device_image *TgtImage) override final {
3597+
const __tgt_device_image *TgtImage) override final {
35493598
assert((TgtImage != nullptr) && "TgtImage is nullptr");
35503599
assert(!(Plugin::get().getRequiresFlags() & OMP_REQ_UNDEFINED) &&
35513600
"Requires flags are not set.");
35523601

3553-
if (!(hasAPUDevice() || hasGfx90aDevice()))
3602+
if (!(hasAPUDevice() || hasDGpuWithUsmSupport()))
35543603
return;
35553604

35563605
bool IsXnackRequired =
35573606
Plugin::get().getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY;
3558-
35593607
utils::XnackBuildMode BinaryXnackMode =
35603608
utils::extractXnackModeFromBinary(TgtImage);
35613609

3610+
if (IsXnackRequired) {
3611+
handleImageRequiresUsmMode(BinaryXnackMode);
3612+
} else {
3613+
handleDefaultMode(BinaryXnackMode);
3614+
}
3615+
}
3616+
3617+
void handleImageRequiresUsmMode(utils::XnackBuildMode xnackImageMode) {
3618+
bool IsXnackActiveOnSystem = IsXnackEnabled();
3619+
3620+
if ((xnackImageMode == utils::XnackBuildMode::XNACK_ANY) ||
3621+
(xnackImageMode == utils::XnackBuildMode::XNACK_PLUS &&
3622+
IsXnackActiveOnSystem) ||
3623+
(xnackImageMode == utils::XnackBuildMode::XNACK_MINUS &&
3624+
!IsXnackActiveOnSystem)) {
3625+
DisableAllocationsForMapsOnApus = true; // Zero-copy
3626+
3627+
if (APUPrefault.get() && hasAPUDevice())
3628+
PrepopulateGPUPageTable = true; // Pre-faulting
3629+
}
3630+
3631+
if (!IsXnackActiveOnSystem &&
3632+
(xnackImageMode != utils::XnackBuildMode::XNACK_PLUS)) {
3633+
FAILURE_MESSAGE(
3634+
"Running a program that requries XNACK on a system where XNACK is "
3635+
"disabled! This may potentially cause memory errors! Just saying.\n");
3636+
}
3637+
}
3638+
3639+
void handleDefaultMode(utils::XnackBuildMode xnackImageMode) {
3640+
// assuming that copying is required
35623641
DisableAllocationsForMapsOnApus = false;
3642+
bool IsXnackActiveOnSystem = IsXnackEnabled();
35633643

3564-
if (IsXnackEnabled()) {
3565-
if (!IsXnackRequired) {
3566-
switch (BinaryXnackMode) {
3567-
case utils::XnackBuildMode::XNACK_PLUS:
3568-
case utils::XnackBuildMode::XNACK_ANY:
3569-
DisableAllocationsForMapsOnApus = true; // Zero-copy
3570-
}
3571-
return;
3572-
}
3573-
} else {
3574-
if (IsXnackRequired) {
3575-
FAILURE_MESSAGE(
3576-
"XNACK is disabled. However, the program requires XNACK "
3577-
"support. Enable XNACK and re-run the program.\n");
3644+
if (IsXnackActiveOnSystem &&
3645+
(hasAPUDevice() || ZeroCopyForMapsOnUsm.get()) &&
3646+
((xnackImageMode == utils::XnackBuildMode::XNACK_ANY) ||
3647+
(xnackImageMode == utils::XnackBuildMode::XNACK_PLUS))) {
3648+
DisableAllocationsForMapsOnApus = true; // Zero-copy
3649+
3650+
if (hasAPUDevice() && APUPrefault.get()) {
3651+
PrepopulateGPUPageTable = true; // Pre-faulting
35783652
}
3653+
return;
3654+
}
3655+
3656+
if (!IsXnackActiveOnSystem && hasAPUDevice() && APUPrefault.get() &&
3657+
((xnackImageMode == utils::XnackBuildMode::XNACK_ANY) ||
3658+
(xnackImageMode == utils::XnackBuildMode::XNACK_MINUS))) {
3659+
DisableAllocationsForMapsOnApus = true; // Zero-copy
3660+
PrepopulateGPUPageTable = true; // Pre-faulting
3661+
return;
35793662
}
3663+
3664+
return;
35803665
}
35813666

35823667
/// This plugin does not support exchanging data between two devices.
@@ -3649,11 +3734,10 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
36493734
return ((HsaXnack.get()) || (utils::IsXnackEnabledViaKernelParam()));
36503735
}
36513736

3652-
bool checkForDeviceByGFXName(const llvm::StringRef GfxLookUpName) {
3653-
bool CheckForMI300A =
3654-
(GfxLookUpName.find_insensitive("gfx940") != llvm::StringRef::npos);
3737+
bool checkForDeviceByGFXName(const llvm::StringRef GfxLookUpName,
3738+
char mi300Specifier = ' ') {
3739+
36553740
char GfxName[64];
3656-
llvm::StringRef GfxNameRef = llvm::StringRef(GfxName);
36573741

36583742
for (hsa_agent_t GPUAgent : KernelAgents) {
36593743
std::memset((void *)&GfxName, 0, sizeof(char) * 64);
@@ -3664,22 +3748,37 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
36643748
if (Status != HSA_STATUS_SUCCESS)
36653749
continue;
36663750

3667-
if (GfxLookUpName.find_insensitive(GfxNameRef) != llvm::StringRef::npos) {
3668-
// Special handling for MI300. We will have to distinguish between an
3669-
// MI300A and X
3670-
if (CheckForMI300A) {
3671-
uint32_t ChipID = 0;
3672-
Status = hsa_agent_get_info(
3673-
GPUAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CHIP_ID, &ChipID);
3751+
llvm::StringRef GfxNameRef = llvm::StringRef(GfxName);
3752+
3753+
if (GfxLookUpName.equals_insensitive(GfxNameRef)) {
3754+
if (mi300Specifier == ' ')
3755+
return true;
36743756

3675-
if (Status != HSA_STATUS_SUCCESS) {
3676-
continue;
3677-
}
3757+
// Special handling for MI300. We will have to distinguish between
3758+
// an MI300A and X
3759+
uint32_t ChipID = 0;
3760+
Status = hsa_agent_get_info(
3761+
GPUAgent, (hsa_agent_info_t)HSA_AMD_AGENT_INFO_CHIP_ID, &ChipID);
36783762

3679-
if ((ChipID & 0x1))
3680-
continue;
3763+
if (Status != HSA_STATUS_SUCCESS) {
3764+
continue;
3765+
}
3766+
3767+
bool IsMi300X = ChipID & 0x1;
3768+
3769+
switch (mi300Specifier) {
3770+
case 'A':
3771+
case 'a':
3772+
if (!IsMi300X)
3773+
return true;
3774+
break;
3775+
case 'x':
3776+
if (IsMi300X) // We are looking for a MI300X
3777+
return true;
3778+
break;
3779+
default:
3780+
FAILURE_MESSAGE("Unknown MI300 specifier!\n");
36813781
}
3682-
return true;
36833782
}
36843783
}
36853784
return false;
@@ -3693,18 +3792,31 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
36933792
/// Flag that shows if device is a GFX90A AMD GPU
36943793
int16_t HasGFX90ADevice{-1};
36953794

3795+
int16_t HasMi300xDevice{-1};
3796+
36963797
/// Flag that shows if device is an APU device
36973798
int16_t HasAPUDevice{-1};
36983799

36993800
BoolEnvar NoMapChecks;
37003801
BoolEnvar DisableUsmMaps;
37013802
BoolEnvar HsaXnack;
3803+
BoolEnvar APUPrefault;
3804+
3805+
// Set by OMPX_APU_MAPS
3806+
// Enables code that detects whether zero copying is possible. If so, the variable
3807+
// DisableAllocationsForMapsOnApus is set to 'true'.
3808+
BoolEnvar ZeroCopyForMapsOnUsm;
37023809

3703-
// Set by OMPX_APU_MAPS environment variable.
37043810
// If set, maps cause no copy operations. USM is used instead. Allocated
3705-
// memory remains coarse grained.
3811+
// memory remains coarse grained. The variable is only considered to be set if
3812+
// ZeroCopyForMapsOnUsm (OMPX_APU_MAPS) is set.
37063813
bool DisableAllocationsForMapsOnApus{false};
37073814

3815+
// Set by OMPX_EAGER_ZERO_COPY_MAPS environment variable.
3816+
// If set, map clauses provoke prefaulting of the GPU
3817+
// page table.
3818+
bool PrepopulateGPUPageTable{false};
3819+
37083820
// Set by OMPX_DISABLE_MAPS environment variable.
37093821
// When active (default value), maps are ignored by the runtime
37103822
bool NoUSMMapChecks{true};

openmp/libomptarget/plugins-nextgen/amdgpu/utils/UtilitiesRTL.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -178,7 +178,7 @@ bool isImageCompatibleWithEnv(const __tgt_image_info *Info,
178178

179179
// Check target image for XNACK mode (XNACK+, XNACK-ANY, XNACK-)
180180
[[nodiscard]] XnackBuildMode
181-
extractXnackModeFromBinary(__tgt_device_image *TgtImage) {
181+
extractXnackModeFromBinary(const __tgt_device_image *TgtImage) {
182182
assert((TgtImage != nullptr) && "TgtImage is nullptr.");
183183
u_int16_t EFlags = elf_get_eflags(TgtImage);
184184

0 commit comments

Comments
 (0)