Skip to content

Commit f354f62

Browse files
[Offload] Add MPI Proxy Plugin
Co-authored-by: Guilherme Valarini <[email protected]>
1 parent 10a1ea9 commit f354f62

33 files changed

+4306
-66
lines changed

offload/CMakeLists.txt

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -139,7 +139,7 @@ if(DEFINED LIBOMPTARGET_BUILD_CUDA_PLUGIN OR
139139
message(WARNING "Option removed, use 'LIBOMPTARGET_PLUGINS_TO_BUILD' instead")
140140
endif()
141141

142-
set(LIBOMPTARGET_ALL_PLUGIN_TARGETS amdgpu cuda host)
142+
set(LIBOMPTARGET_ALL_PLUGIN_TARGETS mpi amdgpu cuda host)
143143
set(LIBOMPTARGET_PLUGINS_TO_BUILD "all" CACHE STRING
144144
"Semicolon-separated list of plugins to use: cuda, amdgpu, host or \"all\".")
145145

@@ -194,8 +194,10 @@ set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-g
194194
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} powerpc64-ibm-linux-gnu-LTO")
195195
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-unknown-linux-gnu")
196196
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-unknown-linux-gnu-LTO")
197+
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} x86_64-unknown-linux-gnu-mpi")
197198
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda")
198199
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-LTO")
200+
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-mpi")
199201
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} nvptx64-nvidia-cuda-JIT-LTO")
200202
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} s390x-ibm-linux-gnu")
201203
set (LIBOMPTARGET_ALL_TARGETS "${LIBOMPTARGET_ALL_TARGETS} s390x-ibm-linux-gnu-LTO")
@@ -341,6 +343,8 @@ set(LIBOMPTARGET_LLVM_LIBRARY_DIR "${LLVM_LIBRARY_DIR}" CACHE STRING
341343
set(LIBOMPTARGET_LLVM_LIBRARY_INTDIR "${LIBOMPTARGET_INTDIR}" CACHE STRING
342344
"Path to folder where intermediate libraries will be output")
343345

346+
set(LIBOMPTARGET_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/src)
347+
344348
# Build offloading plugins and device RTLs if they are available.
345349
add_subdirectory(plugins-nextgen)
346350
add_subdirectory(DeviceRTL)

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 63 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -1208,130 +1208,141 @@ struct GenericPluginTy {
12081208

12091209
/// Returns non-zero if the \p Image is compatible with the plugin. This
12101210
/// function does not require the plugin to be initialized before use.
1211-
int32_t is_plugin_compatible(__tgt_device_image *Image);
1211+
virtual int32_t is_plugin_compatible(__tgt_device_image *Image);
12121212

12131213
/// Returns non-zero if the \p Image is compatible with the device.
1214-
int32_t is_device_compatible(int32_t DeviceId, __tgt_device_image *Image);
1214+
virtual int32_t is_device_compatible(int32_t DeviceId,
1215+
__tgt_device_image *Image);
12151216

12161217
/// Returns non-zero if the plugin device has been initialized.
1217-
int32_t is_device_initialized(int32_t DeviceId) const;
1218+
virtual int32_t is_device_initialized(int32_t DeviceId) const;
12181219

12191220
/// Initialize the device inside of the plugin.
1220-
int32_t init_device(int32_t DeviceId);
1221+
virtual int32_t init_device(int32_t DeviceId);
12211222

12221223
/// Return the number of devices this plugin can support.
1223-
int32_t number_of_devices();
1224+
virtual int32_t number_of_devices();
12241225

12251226
/// Returns non-zero if the data can be exchanged between the two devices.
1226-
int32_t is_data_exchangable(int32_t SrcDeviceId, int32_t DstDeviceId);
1227+
virtual int32_t is_data_exchangable(int32_t SrcDeviceId, int32_t DstDeviceId);
12271228

12281229
/// Initializes the record and replay mechanism inside the plugin.
1229-
int32_t initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
1230-
void *VAddr, bool isRecord, bool SaveOutput,
1231-
uint64_t &ReqPtrArgOffset);
1230+
virtual int32_t initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
1231+
void *VAddr, bool isRecord,
1232+
bool SaveOutput,
1233+
uint64_t &ReqPtrArgOffset);
12321234

12331235
/// Loads the associated binary into the plugin and returns a handle to it.
1234-
int32_t load_binary(int32_t DeviceId, __tgt_device_image *TgtImage,
1235-
__tgt_device_binary *Binary);
1236+
virtual int32_t load_binary(int32_t DeviceId, __tgt_device_image *TgtImage,
1237+
__tgt_device_binary *Binary);
12361238

12371239
/// Allocates memory that is accessible to the given device.
1238-
void *data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr, int32_t Kind);
1240+
virtual void *data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr,
1241+
int32_t Kind);
12391242

12401243
/// Deallocates memory on the given device.
1241-
int32_t data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind);
1244+
virtual int32_t data_delete(int32_t DeviceId, void *TgtPtr, int32_t Kind);
12421245

12431246
/// Locks / pins host memory using the plugin runtime.
1244-
int32_t data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
1245-
void **LockedPtr);
1247+
virtual int32_t data_lock(int32_t DeviceId, void *Ptr, int64_t Size,
1248+
void **LockedPtr);
12461249

12471250
/// Unlocks / unpins host memory using the plugin runtime.
1248-
int32_t data_unlock(int32_t DeviceId, void *Ptr);
1251+
virtual int32_t data_unlock(int32_t DeviceId, void *Ptr);
12491252

12501253
/// Notify the runtime about a new mapping that has been created outside.
1251-
int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr, int64_t Size);
1254+
virtual int32_t data_notify_mapped(int32_t DeviceId, void *HstPtr,
1255+
int64_t Size);
12521256

12531257
/// Notify the runtime about a mapping that has been deleted.
1254-
int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr);
1258+
virtual int32_t data_notify_unmapped(int32_t DeviceId, void *HstPtr);
12551259

12561260
/// Copy data to the given device.
1257-
int32_t data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
1258-
int64_t Size);
1261+
virtual int32_t data_submit(int32_t DeviceId, void *TgtPtr, void *HstPtr,
1262+
int64_t Size);
12591263

12601264
/// Copy data to the given device asynchronously.
1261-
int32_t data_submit_async(int32_t DeviceId, void *TgtPtr, void *HstPtr,
1262-
int64_t Size, __tgt_async_info *AsyncInfoPtr);
1265+
virtual int32_t data_submit_async(int32_t DeviceId, void *TgtPtr,
1266+
void *HstPtr, int64_t Size,
1267+
__tgt_async_info *AsyncInfoPtr);
12631268

12641269
/// Copy data from the given device.
1265-
int32_t data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
1266-
int64_t Size);
1270+
virtual int32_t data_retrieve(int32_t DeviceId, void *HstPtr, void *TgtPtr,
1271+
int64_t Size);
12671272

12681273
/// Copy data from the given device asynchronously.
1269-
int32_t data_retrieve_async(int32_t DeviceId, void *HstPtr, void *TgtPtr,
1270-
int64_t Size, __tgt_async_info *AsyncInfoPtr);
1274+
virtual int32_t data_retrieve_async(int32_t DeviceId, void *HstPtr,
1275+
void *TgtPtr, int64_t Size,
1276+
__tgt_async_info *AsyncInfoPtr);
12711277

12721278
/// Exchange memory addresses between two devices.
1273-
int32_t data_exchange(int32_t SrcDeviceId, void *SrcPtr, int32_t DstDeviceId,
1274-
void *DstPtr, int64_t Size);
1279+
virtual int32_t data_exchange(int32_t SrcDeviceId, void *SrcPtr,
1280+
int32_t DstDeviceId, void *DstPtr,
1281+
int64_t Size);
12751282

12761283
/// Exchange memory addresses between two devices asynchronously.
1277-
int32_t data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
1278-
int DstDeviceId, void *DstPtr, int64_t Size,
1279-
__tgt_async_info *AsyncInfo);
1284+
virtual int32_t data_exchange_async(int32_t SrcDeviceId, void *SrcPtr,
1285+
int DstDeviceId, void *DstPtr,
1286+
int64_t Size,
1287+
__tgt_async_info *AsyncInfo);
12801288

12811289
/// Begin executing a kernel on the given device.
1282-
int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr, void **TgtArgs,
1283-
ptrdiff_t *TgtOffsets, KernelArgsTy *KernelArgs,
1284-
__tgt_async_info *AsyncInfoPtr);
1290+
virtual int32_t launch_kernel(int32_t DeviceId, void *TgtEntryPtr,
1291+
void **TgtArgs, ptrdiff_t *TgtOffsets,
1292+
KernelArgsTy *KernelArgs,
1293+
__tgt_async_info *AsyncInfoPtr);
12851294

12861295
/// Synchronize an asynchronous queue with the plugin runtime.
1287-
int32_t synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
1296+
virtual int32_t synchronize(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
12881297

12891298
/// Query the current state of an asynchronous queue.
1290-
int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
1299+
virtual int32_t query_async(int32_t DeviceId, __tgt_async_info *AsyncInfoPtr);
12911300

12921301
/// Prints information about the given devices supported by the plugin.
1293-
void print_device_info(int32_t DeviceId);
1302+
virtual void print_device_info(int32_t DeviceId);
12941303

12951304
/// Creates an event in the given plugin if supported.
1296-
int32_t create_event(int32_t DeviceId, void **EventPtr);
1305+
virtual int32_t create_event(int32_t DeviceId, void **EventPtr);
12971306

12981307
/// Records an event that has occurred.
1299-
int32_t record_event(int32_t DeviceId, void *EventPtr,
1300-
__tgt_async_info *AsyncInfoPtr);
1308+
virtual int32_t record_event(int32_t DeviceId, void *EventPtr,
1309+
__tgt_async_info *AsyncInfoPtr);
13011310

13021311
/// Wait until an event has occurred.
1303-
int32_t wait_event(int32_t DeviceId, void *EventPtr,
1304-
__tgt_async_info *AsyncInfoPtr);
1312+
virtual int32_t wait_event(int32_t DeviceId, void *EventPtr,
1313+
__tgt_async_info *AsyncInfoPtr);
13051314

13061315
/// Synchronize execution until an event is done.
1307-
int32_t sync_event(int32_t DeviceId, void *EventPtr);
1316+
virtual int32_t sync_event(int32_t DeviceId, void *EventPtr);
13081317

13091318
/// Remove the event from the plugin.
1310-
int32_t destroy_event(int32_t DeviceId, void *EventPtr);
1319+
virtual int32_t destroy_event(int32_t DeviceId, void *EventPtr);
13111320

13121321
/// Sets the info level flag to \p NewInfoLevel.
13131322
void set_info_flag(uint32_t NewInfoLevel);
13141323

13151324
/// Creates an asynchronous queue for the given plugin.
1316-
int32_t init_async_info(int32_t DeviceId, __tgt_async_info **AsyncInfoPtr);
1325+
virtual int32_t init_async_info(int32_t DeviceId,
1326+
__tgt_async_info **AsyncInfoPtr);
13171327

13181328
/// Creates device information to be used for diagnostics.
1319-
int32_t init_device_info(int32_t DeviceId, __tgt_device_info *DeviceInfo,
1320-
const char **ErrStr);
1329+
virtual int32_t init_device_info(int32_t DeviceId,
1330+
__tgt_device_info *DeviceInfo,
1331+
const char **ErrStr);
13211332

13221333
/// Sets the offset into the devices for use by OMPT.
13231334
int32_t set_device_identifier(int32_t UserId, int32_t DeviceId);
13241335

13251336
/// Returns if the plugin can support automatic copy.
1326-
int32_t use_auto_zero_copy(int32_t DeviceId);
1337+
virtual int32_t use_auto_zero_copy(int32_t DeviceId);
13271338

13281339
/// Look up a global symbol in the given binary.
1329-
int32_t get_global(__tgt_device_binary Binary, uint64_t Size,
1330-
const char *Name, void **DevicePtr);
1340+
virtual int32_t get_global(__tgt_device_binary Binary, uint64_t Size,
1341+
const char *Name, void **DevicePtr);
13311342

13321343
/// Look up a kernel function in the given binary.
1333-
int32_t get_function(__tgt_device_binary Binary, const char *Name,
1334-
void **KernelPtr);
1344+
virtual int32_t get_function(__tgt_device_binary Binary, const char *Name,
1345+
void **KernelPtr);
13351346

13361347
private:
13371348
/// Indicates if the platform runtime has been fully initialized.

offload/plugins-nextgen/host/src/rtl.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,7 @@
4343
#endif
4444

4545
// The number of devices in this plugin.
46-
#define NUM_DEVICES 4
46+
#define NUM_DEVICES 1
4747

4848
namespace llvm {
4949
namespace omp {
Lines changed: 134 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,134 @@
1+
# Looking for MPI...
2+
find_package(MPI QUIET)
3+
4+
if(NOT(CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)$" AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
5+
message(STATUS "Not building MPI offloading plugin: only support MPI in Linux x86_64 or ppc64le hosts.")
6+
return()
7+
elseif(NOT MPI_CXX_FOUND)
8+
message(STATUS "Not building MPI offloading plugin: MPI not found in system.")
9+
return()
10+
endif()
11+
12+
message(STATUS "Building MPI Proxy offloading plugin.")
13+
14+
# Event System
15+
add_subdirectory(event_system)
16+
17+
# MPI Plugin
18+
19+
# Create the library and add the default arguments.
20+
add_target_library(omptarget.rtl.mpi MPI)
21+
22+
target_sources(omptarget.rtl.mpi PRIVATE
23+
src/rtl.cpp
24+
)
25+
26+
target_link_libraries(omptarget.rtl.mpi PRIVATE
27+
EventSystem
28+
)
29+
30+
# Add include directories
31+
target_include_directories(omptarget.rtl.mpi PRIVATE
32+
${LIBOMPTARGET_INCLUDE_DIR})
33+
34+
# Set C++20 as the target standard for this plugin.
35+
set_target_properties(omptarget.rtl.mpi
36+
PROPERTIES
37+
CXX_STANDARD 20
38+
CXX_STANDARD_REQUIRED ON)
39+
40+
41+
# Configure testing for the MPI plugin.
42+
list(APPEND LIBOMPTARGET_TESTED_PLUGINS "omptarget.rtl.mpi")
43+
# Report to the parent scope that we are building a plugin for MPI.
44+
set(LIBOMPTARGET_TESTED_PLUGINS "${LIBOMPTARGET_TESTED_PLUGINS}" PARENT_SCOPE)
45+
46+
# Append the MPI-specific target triples and report them to the parent scope.
47+
set(LIBOMPTARGET_SYSTEM_TARGETS
48+
"${LIBOMPTARGET_SYSTEM_TARGETS} x86_64-pc-linux-gnu-mpi nvptx64-nvidia-cuda-mpi" PARENT_SCOPE)
49+
50+
# Remote Plugin Manager
51+
message(STATUS "Building the llvm-offload-mpi-proxy-device")
52+
53+
set(LIBOMPTARGET_ALL_REMOTE_PLUGIN_TARGETS amdgpu cuda host)
54+
set(LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD "all" CACHE STRING
55+
"Semicolon-separated list of plugins to use: cuda, amdgpu, host or \"all\".")
56+
57+
if(LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD STREQUAL "all")
58+
set(LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD ${LIBOMPTARGET_ALL_REMOTE_PLUGIN_TARGETS})
59+
endif()
60+
61+
if(NOT CMAKE_SYSTEM_NAME MATCHES "Linux" AND
62+
"host" IN_LIST LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD)
63+
message(STATUS "Not building remote host plugin: only Linux systems are supported")
64+
list(REMOVE_ITEM LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD "host")
65+
endif()
66+
if(NOT (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86_64)|(ppc64le)|(aarch64)$"
67+
AND CMAKE_SYSTEM_NAME MATCHES "Linux"))
68+
if("amdgpu" IN_LIST LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD)
69+
message(STATUS "Not building remote AMDGPU plugin: only support AMDGPU in "
70+
"Linux x86_64, ppc64le, or aarch64 hosts")
71+
list(REMOVE_ITEM LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD "amdgpu")
72+
endif()
73+
if("cuda" IN_LIST LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD)
74+
message(STATUS "Not building remote CUDA plugin: only support CUDA in "
75+
"Linux x86_64, ppc64le, or aarch64 hosts")
76+
list(REMOVE_ITEM LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD "cuda")
77+
endif()
78+
endif()
79+
if("mpi" IN_LIST LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD)
80+
message(STATUS "It is not possible to build the mpi plugin inside "
81+
"the remote proxy device")
82+
list(REMOVE_ITEM LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD "mpi")
83+
endif()
84+
85+
message(STATUS "Building the MPI Plugin with support for remote offloading to "
86+
"the \"${LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD}\" plugins")
87+
88+
set(REMOTE_MPI_ENUM_PLUGIN_TARGETS "")
89+
foreach(plugin IN LISTS LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD)
90+
set(REMOTE_MPI_ENUM_PLUGIN_TARGETS
91+
"${REMOTE_MPI_ENUM_PLUGIN_TARGETS}PLUGIN_TARGET(${plugin})\n")
92+
endforeach()
93+
# Trim the trailing newline left by the loop above. The expansion is quoted so
# that an empty remote plugin list still passes an (empty) input string to
# string(STRIP); unquoted, the argument would vanish and configuration would
# fail with a wrong-number-of-arguments error.
string(STRIP "${REMOTE_MPI_ENUM_PLUGIN_TARGETS}" REMOTE_MPI_ENUM_PLUGIN_TARGETS)
94+
configure_file(
95+
${CMAKE_CURRENT_SOURCE_DIR}/src/RemoteTargets.def.in
96+
${LIBOMPTARGET_BINARY_INCLUDE_DIR}/Shared/RemoteTargets.def
97+
)
98+
99+
llvm_add_tool(OPENMP llvm-offload-mpi-proxy-device
100+
src/ProxyDevice.cpp
101+
src/RemotePluginManager.cpp
102+
${LIBOMPTARGET_SRC_DIR}/OpenMP/OMPT/Callback.cpp
103+
)
104+
105+
llvm_update_compile_flags(llvm-offload-mpi-proxy-device)
106+
107+
target_link_libraries(llvm-offload-mpi-proxy-device PRIVATE
108+
EventSystem
109+
LLVMSupport
110+
omp
111+
)
112+
113+
add_dependencies(llvm-offload-mpi-proxy-device omp)
114+
115+
target_include_directories(llvm-offload-mpi-proxy-device PRIVATE
116+
${LIBOMPTARGET_INCLUDE_DIR}
117+
${LIBOMPTARGET_LLVM_INCLUDE_DIRS}
118+
${LIBOMPTARGET_BINARY_INCLUDE_DIR}
119+
)
120+
121+
foreach(plugin IN LISTS LIBOMPTARGET_REMOTE_PLUGINS_TO_BUILD)
122+
target_link_libraries(llvm-offload-mpi-proxy-device PRIVATE omptarget.rtl.${plugin})
123+
add_dependencies(llvm-offload-mpi-proxy-device omptarget.rtl.${plugin})
124+
endforeach()
125+
126+
# Set C++20 as the target standard for the proxy device tool.
127+
set_target_properties(llvm-offload-mpi-proxy-device
128+
PROPERTIES
129+
CXX_STANDARD 20
130+
CXX_STANDARD_REQUIRED ON)
131+
132+
target_compile_definitions(llvm-offload-mpi-proxy-device PRIVATE
133+
TARGET_NAME=llvm-offload-mpi-proxy-device
134+
DEBUG_PREFIX="MPIProxyDevice")
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
# Build EventSystem
2+
add_library(EventSystem OBJECT
3+
EventSystem.cpp
4+
)
5+
6+
target_include_directories(EventSystem PUBLIC
7+
${CMAKE_CURRENT_SOURCE_DIR}
8+
${LIBOMPTARGET_BINARY_INCLUDE_DIR}
9+
${LIBOMPTARGET_INCLUDE_DIR}
10+
)
11+
12+
target_link_libraries(EventSystem PRIVATE
13+
MPI::MPI_CXX
14+
LLVMSupport
15+
)
16+
17+
target_compile_options(EventSystem PUBLIC ${offload_compile_flags})
18+
target_link_options(EventSystem PUBLIC ${offload_link_flags})
19+
20+
set_target_properties(EventSystem PROPERTIES POSITION_INDEPENDENT_CODE ON)
21+
22+
# Set C++20 as the target standard for the EventSystem library.
23+
set_target_properties(EventSystem
24+
PROPERTIES
25+
CXX_STANDARD 20
26+
CXX_STANDARD_REQUIRED ON)
27+
28+
target_compile_definitions(EventSystem PRIVATE
29+
DEBUG_PREFIX="EventSystem")

0 commit comments

Comments
 (0)