Skip to content

Commit a3d8d68

Browse files
authored
Remove the requirement for CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED for NVLS support (#489)
Remove the requirement for `CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED` for NVLS support. Fix #487
1 parent 0f21ed4 commit a3d8d68

File tree

6 files changed

+99
-23
lines changed

6 files changed

+99
-23
lines changed

docs/getting-started/quickstart.md

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -29,10 +29,7 @@
2929
```
3030
lsmod | grep nvidia_peermem
3131
```
32-
* For GPU with nvls support, the IMEX channels should be set up (refer to [cuMemCreate](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g899d69a862bba36449789c64b430dc7c)). You can set up the channels manually via:
33-
```
34-
sudo nvidia-modprobe -s -i <start:number of minors>
35-
```
32+
* For GPUs with NVLS support, we require kernel version 5.6 or above.
3633
3734
## Build with Docker Images
3835

include/mscclpp/gpu.hpp

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -101,10 +101,15 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
101101
// NVLS
102102
#if !defined(__HIP_PLATFORM_AMD__)
103103
#include <linux/version.h>
104-
// We need CU_MEM_HANDLE_TYPE_FABRIC (introduced in CUDA 12.3) to support sharing handles across GPUs via sockets
105-
#define CUDA_NVLS_SUPPORTED ((CUDART_VERSION >= 12030) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)))
106-
#else // !defined(__HIP_PLATFORM_AMD__)
104+
#if CUDART_VERSION < 12030
105+
#define CU_MEM_HANDLE_TYPE_FABRIC ((CUmemAllocationHandleType)0x8ULL)
106+
#endif
107+
// We need CUDA 12.0 or above and kernel version 5.6.0 or above for NVLS
108+
#define CUDA_NVLS_SUPPORTED ((CUDART_VERSION >= 12000) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)))
109+
#else // defined(__HIP_PLATFORM_AMD__)
107110
#define CUDA_NVLS_SUPPORTED 0
111+
// NVLS is not supported on the AMD platform; this definition exists only to avoid a compilation error
112+
#define CU_MEM_HANDLE_TYPE_FABRIC (0x8ULL)
108113
#endif // !defined(__HIP_PLATFORM_AMD__)
109114

110115
// GPU sync threads

include/mscclpp/gpu_utils.hpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@ void* gpuCallocHost(size_t bytes);
6363
void* gpuCallocUncached(size_t bytes);
6464
#endif // defined(__HIP_PLATFORM_AMD__)
6565
#if (CUDA_NVLS_SUPPORTED)
66+
extern CUmemAllocationHandleType nvlsCompatibleMemHandleType;
6667
void* gpuCallocPhysical(size_t bytes, size_t gran = 0, size_t align = 0);
6768
#endif // CUDA_NVLS_SUPPORTED
6869

src/gpu_utils.cc

Lines changed: 21 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,8 @@ bool CudaStreamWithFlags::empty() const { return stream_ == nullptr; }
3030

3131
namespace detail {
3232

33+
CUmemAllocationHandleType nvlsCompatibleMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
34+
3335
/// set memory access permission to read-write
3436
/// @param base Base memory pointer.
3537
/// @param size Size of the memory.
@@ -96,11 +98,17 @@ void* gpuCallocPhysical(size_t bytes, size_t gran, size_t align) {
9698
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
9799
MSCCLPP_CUTHROW(cuDeviceGet(&currentDevice, deviceId));
98100

101+
int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
102+
int isFabricSupported;
103+
MSCCLPP_CUTHROW(
104+
cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDevice));
105+
if (isFabricSupported) {
106+
requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC;
107+
}
99108
CUmemAllocationProp prop = {};
100109
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
101110
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
102-
prop.requestedHandleTypes =
103-
(CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
111+
prop.requestedHandleTypes = (CUmemAllocationHandleType)(requestedHandleTypes);
104112
prop.location.id = currentDevice;
105113

106114
if (gran == 0) {
@@ -110,7 +118,16 @@ void* gpuCallocPhysical(size_t bytes, size_t gran, size_t align) {
110118
// allocate physical memory
111119
CUmemGenericAllocationHandle memHandle;
112120
size_t nbytes = (bytes + gran - 1) / gran * gran;
113-
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0 /*flags*/));
121+
CUresult result = cuMemCreate(&memHandle, nbytes, &prop, 0);
122+
if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC &&
123+
(result == CUDA_ERROR_NOT_PERMITTED || result == CUDA_ERROR_NOT_SUPPORTED)) {
124+
requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
125+
prop.requestedHandleTypes = (CUmemAllocationHandleType)requestedHandleTypes;
126+
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0));
127+
} else {
128+
MSCCLPP_CUTHROW(result);
129+
}
130+
nvlsCompatibleMemHandleType = (CUmemAllocationHandleType)requestedHandleTypes;
114131

115132
if (align == 0) {
116133
align = getMulticastGranularity(nbytes, CU_MULTICAST_GRANULARITY_MINIMUM);
@@ -172,12 +189,10 @@ bool isNvlsSupported() {
172189
#if (CUDA_NVLS_SUPPORTED)
173190
if (!isChecked) {
174191
int isMulticastSupported;
175-
int isFabricSupported;
176192
CUdevice dev;
177193
MSCCLPP_CUTHROW(cuCtxGetDevice(&dev));
178194
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
179-
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, dev));
180-
result = (isMulticastSupported == 1 && isFabricSupported == 1);
195+
return isMulticastSupported == 1;
181196
}
182197
return result;
183198
#endif

src/include/registered_memory.hpp

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,14 @@ struct TransportInfo {
2828
IbMrInfo ibMrInfo;
2929
};
3030
struct {
31-
char shareableHandle[64];
31+
union {
32+
char shareableHandle[64];
33+
struct {
34+
// These are only defined for multicast (NVLS) capability
35+
pid_t rootPid;
36+
int fileDesc;
37+
};
38+
};
3239
size_t offsetFromBase;
3340
};
3441
};
@@ -47,6 +54,9 @@ struct RegisteredMemory::Impl {
4754
TransportFlags transports;
4855
std::vector<TransportInfo> transportInfos;
4956

57+
// For sharing memory handle via file descriptor
58+
int fileDesc = -1;
59+
5060
Impl(void* data, size_t size, TransportFlags transports, Context::Impl& contextImpl);
5161
/// Constructs a RegisteredMemory::Impl from a vector of data. The constructor should only be used for the remote
5262
/// memory.

src/registered_memory.cc

Lines changed: 57 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,9 @@
33

44
#include "registered_memory.hpp"
55

6+
#include <sys/syscall.h>
7+
#include <unistd.h>
8+
69
#include <algorithm>
710
#include <mscclpp/gpu_utils.hpp>
811

@@ -24,9 +27,13 @@
2427
} while (false)
2528

2629
namespace {
27-
CUmemAllocationHandleType getNvlsCompatibleMemHandleType() {
30+
CUmemAllocationHandleType getNvlsMemHandleType() {
2831
#if (CUDA_NVLS_SUPPORTED)
29-
return CU_MEM_HANDLE_TYPE_FABRIC;
32+
if (mscclpp::detail::nvlsCompatibleMemHandleType & CU_MEM_HANDLE_TYPE_FABRIC) {
33+
return CU_MEM_HANDLE_TYPE_FABRIC;
34+
} else {
35+
return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
36+
}
3037
#else
3138
throw mscclpp::Error("Only support GPU with NVLS support", mscclpp::ErrorCode::InvalidUsage);
3239
#endif
@@ -72,8 +79,16 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports,
7279
if (this->isCuMemMapAlloc) {
7380
CUmemGenericAllocationHandle handle;
7481
MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&handle, baseDataPtr));
75-
MSCCLPP_CUTHROW(
76-
cuMemExportToShareableHandle(transportInfo.shareableHandle, handle, getNvlsCompatibleMemHandleType(), 0));
82+
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) {
83+
MSCCLPP_CUTHROW(cuMemExportToShareableHandle(transportInfo.shareableHandle, handle, getNvlsMemHandleType(), 0));
84+
} else {
85+
transportInfo.rootPid = getpid();
86+
if (transportInfo.rootPid < 0) {
87+
throw mscclpp::SysError("getpid() failed", errno);
88+
}
89+
MSCCLPP_CUTHROW(cuMemExportToShareableHandle(&transportInfo.fileDesc, handle, getNvlsMemHandleType(), 0));
90+
this->fileDesc = transportInfo.fileDesc;
91+
}
7792
transportInfo.offsetFromBase = (char*)data - (char*)baseDataPtr;
7893
MSCCLPP_CUTHROW(cuMemRelease(handle));
7994
} else {
@@ -138,8 +153,13 @@ MSCCLPP_API_CPP std::vector<char> RegisteredMemory::serialize() {
138153
std::copy_n(reinterpret_cast<char*>(&entry.transport), sizeof(entry.transport), std::back_inserter(result));
139154
if (entry.transport == Transport::CudaIpc) {
140155
if (pimpl_->isCuMemMapAlloc) {
141-
std::copy_n(reinterpret_cast<char*>(&entry.shareableHandle), sizeof(entry.shareableHandle),
142-
std::back_inserter(result));
156+
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) {
157+
std::copy_n(reinterpret_cast<char*>(&entry.shareableHandle), sizeof(entry.shareableHandle),
158+
std::back_inserter(result));
159+
} else {
160+
std::copy_n(reinterpret_cast<char*>(&entry.rootPid), sizeof(entry.rootPid), std::back_inserter(result));
161+
std::copy_n(reinterpret_cast<char*>(&entry.fileDesc), sizeof(entry.fileDesc), std::back_inserter(result));
162+
}
143163
std::copy_n(reinterpret_cast<char*>(&entry.offsetFromBase), sizeof(entry.offsetFromBase),
144164
std::back_inserter(result));
145165
} else {
@@ -184,8 +204,16 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
184204
it += sizeof(transportInfo.transport);
185205
if (transportInfo.transport == Transport::CudaIpc) {
186206
if (this->isCuMemMapAlloc) {
187-
std::copy_n(it, sizeof(transportInfo.shareableHandle), reinterpret_cast<char*>(&transportInfo.shareableHandle));
188-
it += sizeof(transportInfo.shareableHandle);
207+
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) {
208+
std::copy_n(it, sizeof(transportInfo.shareableHandle),
209+
reinterpret_cast<char*>(&transportInfo.shareableHandle));
210+
it += sizeof(transportInfo.shareableHandle);
211+
} else {
212+
std::copy_n(it, sizeof(transportInfo.rootPid), reinterpret_cast<char*>(&transportInfo.rootPid));
213+
it += sizeof(transportInfo.rootPid);
214+
std::copy_n(it, sizeof(transportInfo.fileDesc), reinterpret_cast<char*>(&transportInfo.fileDesc));
215+
it += sizeof(transportInfo.fileDesc);
216+
}
189217
std::copy_n(it, sizeof(transportInfo.offsetFromBase), reinterpret_cast<char*>(&transportInfo.offsetFromBase));
190218
it += sizeof(transportInfo.offsetFromBase);
191219
} else {
@@ -220,7 +248,23 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
220248
if (this->isCuMemMapAlloc) {
221249
#if (CUDA_NVLS_SUPPORTED)
222250
CUmemGenericAllocationHandle handle;
223-
MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&handle, entry.shareableHandle, getNvlsCompatibleMemHandleType()));
251+
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) {
252+
MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&handle, entry.shareableHandle, getNvlsMemHandleType()));
253+
} else {
254+
int rootPidFd = syscall(SYS_pidfd_open, entry.rootPid, 0);
255+
if (rootPidFd < 0) {
256+
throw mscclpp::SysError("pidfd_open() failed", errno);
257+
}
258+
int fd = syscall(SYS_pidfd_getfd, rootPidFd, entry.fileDesc, 0);
259+
if (fd < 0) {
260+
throw mscclpp::SysError("pidfd_getfd() failed", errno);
261+
}
262+
INFO(MSCCLPP_P2P, "Get file descriptor %d from pidfd %d on peer 0x%lx", fd, rootPidFd, hostHash);
263+
MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&handle, reinterpret_cast<void*>(fd),
264+
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
265+
close(rootPidFd);
266+
close(fd);
267+
}
224268
size_t minGran = detail::getMulticastGranularity(size, CU_MULTICAST_GRANULARITY_MINIMUM);
225269
size_t recommendedGran = detail::getMulticastGranularity(size, CU_MULTICAST_GRANULARITY_RECOMMENDED);
226270
size_t size = (this->size + recommendedGran - 1) / recommendedGran * recommendedGran;
@@ -257,6 +301,9 @@ RegisteredMemory::Impl::~Impl() {
257301
MSCCLPP_CULOG_WARN(cuMemUnmap((CUdeviceptr)base, size));
258302
MSCCLPP_CULOG_WARN(cuMemRelease(handle));
259303
MSCCLPP_CULOG_WARN(cuMemAddressFree((CUdeviceptr)base, size));
304+
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR && fileDesc >= 0) {
305+
close(fileDesc);
306+
}
260307
} else {
261308
cudaError_t err = cudaIpcCloseMemHandle(base);
262309
if (err != cudaSuccess) {
@@ -266,6 +313,7 @@ RegisteredMemory::Impl::~Impl() {
266313
}
267314
}
268315
data = nullptr;
316+
fileDesc = -1;
269317
}
270318
}
271319

0 commit comments

Comments
 (0)