Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions docs/getting-started/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,7 @@
```
lsmod | grep nvidia_peermem
```
* For GPU with nvls support, the IMEX channels should be set up (refer [cuMemCreate](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g899d69a862bba36449789c64b430dc7c)). You can set up the channels manually via:
```
sudo nvidia-modprobe -s -i <start:number of minors>
```
* For GPUs with NVLS support, we require kernel version 5.6 or above.

## Build with Docker Images

Expand Down
11 changes: 8 additions & 3 deletions include/mscclpp/gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,15 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
// NVLS
#if !defined(__HIP_PLATFORM_AMD__)
#include <linux/version.h>
// We need CU_MEM_HANDLE_TYPE_FABRIC (instroduced in cuda12.3) to support sharing handles across GPUs via sockets
#define CUDA_NVLS_SUPPORTED ((CUDART_VERSION >= 12030) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)))
#else // !defined(__HIP_PLATFORM_AMD__)
#if CUDART_VERSION < 12030
#define CU_MEM_HANDLE_TYPE_FABRIC ((CUmemAllocationHandleType)0x8ULL)
#endif
// We need CUDA 12.0 or above and kernel 5.6.0 or above for NVLS
#define CUDA_NVLS_SUPPORTED ((CUDART_VERSION >= 12000) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)))
#else // defined(__HIP_PLATFORM_AMD__)
#define CUDA_NVLS_SUPPORTED 0
// NVLS is not supported on the AMD platform; this definition exists only to avoid compilation errors
#define CU_MEM_HANDLE_TYPE_FABRIC (0x8ULL)
#endif // !defined(__HIP_PLATFORM_AMD__)

// GPU sync threads
Expand Down
1 change: 1 addition & 0 deletions include/mscclpp/gpu_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ void* gpuCallocHost(size_t bytes);
void* gpuCallocUncached(size_t bytes);
#endif // defined(__HIP_PLATFORM_AMD__)
#if (CUDA_NVLS_SUPPORTED)
extern CUmemAllocationHandleType nvlsCompatibleMemHandleType;
void* gpuCallocPhysical(size_t bytes, size_t gran = 0, size_t align = 0);
#endif // CUDA_NVLS_SUPPORTED

Expand Down
27 changes: 21 additions & 6 deletions src/gpu_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ bool CudaStreamWithFlags::empty() const { return stream_ == nullptr; }

namespace detail {

CUmemAllocationHandleType nvlsCompatibleMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;

/// set memory access permission to read-write
/// @param base Base memory pointer.
/// @param size Size of the memory.
Expand Down Expand Up @@ -96,11 +98,17 @@ void* gpuCallocPhysical(size_t bytes, size_t gran, size_t align) {
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
MSCCLPP_CUTHROW(cuDeviceGet(&currentDevice, deviceId));

int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
int isFabricSupported;
MSCCLPP_CUTHROW(
cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDevice));
if (isFabricSupported) {
requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC;
}
CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.requestedHandleTypes =
(CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
prop.requestedHandleTypes = (CUmemAllocationHandleType)(requestedHandleTypes);
prop.location.id = currentDevice;

if (gran == 0) {
Expand All @@ -110,7 +118,16 @@ void* gpuCallocPhysical(size_t bytes, size_t gran, size_t align) {
// allocate physical memory
CUmemGenericAllocationHandle memHandle;
size_t nbytes = (bytes + gran - 1) / gran * gran;
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0 /*flags*/));
CUresult result = cuMemCreate(&memHandle, nbytes, &prop, 0);
if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC &&
(result == CUDA_ERROR_NOT_PERMITTED || result == CUDA_ERROR_NOT_SUPPORTED)) {
requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
prop.requestedHandleTypes = (CUmemAllocationHandleType)requestedHandleTypes;
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0));
} else {
MSCCLPP_CUTHROW(result);
}
nvlsCompatibleMemHandleType = (CUmemAllocationHandleType)requestedHandleTypes;

if (align == 0) {
align = getMulticastGranularity(nbytes, CU_MULTICAST_GRANULARITY_MINIMUM);
Expand Down Expand Up @@ -172,12 +189,10 @@ bool isNvlsSupported() {
#if (CUDA_NVLS_SUPPORTED)
if (!isChecked) {
int isMulticastSupported;
int isFabricSupported;
CUdevice dev;
MSCCLPP_CUTHROW(cuCtxGetDevice(&dev));
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, dev));
result = (isMulticastSupported == 1 && isFabricSupported == 1);
return isMulticastSupported == 1;
}
return result;
#endif
Expand Down
12 changes: 11 additions & 1 deletion src/include/registered_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,14 @@ struct TransportInfo {
IbMrInfo ibMrInfo;
};
struct {
char shareableHandle[64];
union {
char shareableHandle[64];
struct {
// These are only defined for multicast (NVLS) capability
pid_t rootPid;
int fileDesc;
};
};
size_t offsetFromBase;
};
};
Expand All @@ -47,6 +54,9 @@ struct RegisteredMemory::Impl {
TransportFlags transports;
std::vector<TransportInfo> transportInfos;

// File descriptor used to share the memory handle across processes (POSIX FD handle type)
int fileDesc = -1;

Impl(void* data, size_t size, TransportFlags transports, Context::Impl& contextImpl);
/// Constructs a RegisteredMemory::Impl from a vector of data. The constructor should only be used for the remote
/// memory.
Expand Down
66 changes: 57 additions & 9 deletions src/registered_memory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@

#include "registered_memory.hpp"

#include <sys/syscall.h>
#include <unistd.h>

#include <algorithm>
#include <mscclpp/gpu_utils.hpp>

Expand All @@ -24,9 +27,13 @@
} while (false)

namespace {
CUmemAllocationHandleType getNvlsCompatibleMemHandleType() {
CUmemAllocationHandleType getNvlsMemHandleType() {
#if (CUDA_NVLS_SUPPORTED)
return CU_MEM_HANDLE_TYPE_FABRIC;
if (mscclpp::detail::nvlsCompatibleMemHandleType & CU_MEM_HANDLE_TYPE_FABRIC) {
return CU_MEM_HANDLE_TYPE_FABRIC;
} else {
return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
}
#else
throw mscclpp::Error("Only support GPU with NVLS support", mscclpp::ErrorCode::InvalidUsage);
#endif
Expand Down Expand Up @@ -72,8 +79,16 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports,
if (this->isCuMemMapAlloc) {
CUmemGenericAllocationHandle handle;
MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&handle, baseDataPtr));
MSCCLPP_CUTHROW(
cuMemExportToShareableHandle(transportInfo.shareableHandle, handle, getNvlsCompatibleMemHandleType(), 0));
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) {
MSCCLPP_CUTHROW(cuMemExportToShareableHandle(transportInfo.shareableHandle, handle, getNvlsMemHandleType(), 0));
} else {
transportInfo.rootPid = getpid();
if (transportInfo.rootPid < 0) {
throw mscclpp::SysError("getpid() failed", errno);
}
MSCCLPP_CUTHROW(cuMemExportToShareableHandle(&transportInfo.fileDesc, handle, getNvlsMemHandleType(), 0));
this->fileDesc = transportInfo.fileDesc;
}
transportInfo.offsetFromBase = (char*)data - (char*)baseDataPtr;
MSCCLPP_CUTHROW(cuMemRelease(handle));
} else {
Expand Down Expand Up @@ -138,8 +153,13 @@ MSCCLPP_API_CPP std::vector<char> RegisteredMemory::serialize() {
std::copy_n(reinterpret_cast<char*>(&entry.transport), sizeof(entry.transport), std::back_inserter(result));
if (entry.transport == Transport::CudaIpc) {
if (pimpl_->isCuMemMapAlloc) {
std::copy_n(reinterpret_cast<char*>(&entry.shareableHandle), sizeof(entry.shareableHandle),
std::back_inserter(result));
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) {
std::copy_n(reinterpret_cast<char*>(&entry.shareableHandle), sizeof(entry.shareableHandle),
std::back_inserter(result));
} else {
std::copy_n(reinterpret_cast<char*>(&entry.rootPid), sizeof(entry.rootPid), std::back_inserter(result));
std::copy_n(reinterpret_cast<char*>(&entry.fileDesc), sizeof(entry.fileDesc), std::back_inserter(result));
}
std::copy_n(reinterpret_cast<char*>(&entry.offsetFromBase), sizeof(entry.offsetFromBase),
std::back_inserter(result));
} else {
Expand Down Expand Up @@ -184,8 +204,16 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
it += sizeof(transportInfo.transport);
if (transportInfo.transport == Transport::CudaIpc) {
if (this->isCuMemMapAlloc) {
std::copy_n(it, sizeof(transportInfo.shareableHandle), reinterpret_cast<char*>(&transportInfo.shareableHandle));
it += sizeof(transportInfo.shareableHandle);
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) {
std::copy_n(it, sizeof(transportInfo.shareableHandle),
reinterpret_cast<char*>(&transportInfo.shareableHandle));
it += sizeof(transportInfo.shareableHandle);
} else {
std::copy_n(it, sizeof(transportInfo.rootPid), reinterpret_cast<char*>(&transportInfo.rootPid));
it += sizeof(transportInfo.rootPid);
std::copy_n(it, sizeof(transportInfo.fileDesc), reinterpret_cast<char*>(&transportInfo.fileDesc));
it += sizeof(transportInfo.fileDesc);
}
std::copy_n(it, sizeof(transportInfo.offsetFromBase), reinterpret_cast<char*>(&transportInfo.offsetFromBase));
it += sizeof(transportInfo.offsetFromBase);
} else {
Expand Down Expand Up @@ -220,7 +248,23 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
if (this->isCuMemMapAlloc) {
#if (CUDA_NVLS_SUPPORTED)
CUmemGenericAllocationHandle handle;
MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&handle, entry.shareableHandle, getNvlsCompatibleMemHandleType()));
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) {
MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&handle, entry.shareableHandle, getNvlsMemHandleType()));
} else {
int rootPidFd = syscall(SYS_pidfd_open, entry.rootPid, 0);
if (rootPidFd < 0) {
throw mscclpp::SysError("pidfd_open() failed", errno);
}
int fd = syscall(SYS_pidfd_getfd, rootPidFd, entry.fileDesc, 0);
if (fd < 0) {
throw mscclpp::SysError("pidfd_getfd() failed", errno);
}
INFO(MSCCLPP_P2P, "Get file descriptor %d from pidfd %d on peer 0x%lx", fd, rootPidFd, hostHash);
MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&handle, reinterpret_cast<void*>(fd),
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
close(rootPidFd);
close(fd);
}
size_t minGran = detail::getMulticastGranularity(size, CU_MULTICAST_GRANULARITY_MINIMUM);
size_t recommendedGran = detail::getMulticastGranularity(size, CU_MULTICAST_GRANULARITY_RECOMMENDED);
size_t size = (this->size + recommendedGran - 1) / recommendedGran * recommendedGran;
Expand Down Expand Up @@ -257,6 +301,9 @@ RegisteredMemory::Impl::~Impl() {
MSCCLPP_CULOG_WARN(cuMemUnmap((CUdeviceptr)base, size));
MSCCLPP_CULOG_WARN(cuMemRelease(handle));
MSCCLPP_CULOG_WARN(cuMemAddressFree((CUdeviceptr)base, size));
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR && fileDesc >= 0) {
close(fileDesc);
}
} else {
cudaError_t err = cudaIpcCloseMemHandle(base);
if (err != cudaSuccess) {
Expand All @@ -266,6 +313,7 @@ RegisteredMemory::Impl::~Impl() {
}
}
data = nullptr;
fileDesc = -1;
}
}

Expand Down
Loading