Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions docs/getting-started/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,7 @@
```
lsmod | grep nvidia_peermem
```
* For GPU with nvls support, the IMEX channels should be set up (refer [cuMemCreate](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g899d69a862bba36449789c64b430dc7c)). You can set up the channels manually via:
```
sudo nvidia-modprobe -s -i <start:number of minors>
```
* For GPUs with NVLS support, the kernel version must be 5.6 or above.

## Build with Docker Images

Expand Down
4 changes: 2 additions & 2 deletions include/mscclpp/gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
// NVLS
#if !defined(__HIP_PLATFORM_AMD__)
#include <linux/version.h>
// We need CU_MEM_HANDLE_TYPE_FABRIC (introduced in CUDA 12.3) to support sharing handles across GPUs via sockets
#define CUDA_NVLS_SUPPORTED ((CUDART_VERSION >= 12030) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)))
// We need CUDA 12.0 or above and kernel 5.6.0 or above for NVLS
#define CUDA_NVLS_SUPPORTED ((CUDART_VERSION >= 12000) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)))
#else // !defined(__HIP_PLATFORM_AMD__)
#define CUDA_NVLS_SUPPORTED 0
#endif // !defined(__HIP_PLATFORM_AMD__)
Expand Down
1 change: 1 addition & 0 deletions include/mscclpp/gpu_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ void* gpuCallocHost(size_t bytes);
void* gpuCallocUncached(size_t bytes);
#endif // defined(__HIP_PLATFORM_AMD__)
#if (CUDA_NVLS_SUPPORTED)
extern CUmemAllocationHandleType nvlsCompatibleMemHandleType;
void* gpuCallocPhysical(size_t bytes, size_t gran = 0, size_t align = 0);
#endif // CUDA_NVLS_SUPPORTED

Expand Down
26 changes: 21 additions & 5 deletions src/gpu_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ bool CudaStreamWithFlags::empty() const { return stream_ == nullptr; }

namespace detail {

CUmemAllocationHandleType nvlsCompatibleMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;

/// set memory access permission to read-write
/// @param base Base memory pointer.
/// @param size Size of the memory.
Expand Down Expand Up @@ -96,11 +98,18 @@ void* gpuCallocPhysical(size_t bytes, size_t gran, size_t align) {
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
MSCCLPP_CUTHROW(cuDeviceGet(&currentDevice, deviceId));

int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
int isFabricSupported;
MSCCLPP_CUTHROW(
cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDevice));
if (isFabricSupported) {
requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC;
}
CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.requestedHandleTypes =
(CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
(CUmemAllocationHandleType)(requestedHandleTypes);
prop.location.id = currentDevice;

if (gran == 0) {
Expand All @@ -110,7 +119,16 @@ void* gpuCallocPhysical(size_t bytes, size_t gran, size_t align) {
// allocate physical memory
CUmemGenericAllocationHandle memHandle;
size_t nbytes = (bytes + gran - 1) / gran * gran;
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0 /*flags*/));
CUresult result = cuMemCreate(&memHandle, nbytes, &prop, 0);
if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC &&
(result == CUDA_ERROR_NOT_PERMITTED || result == CUDA_ERROR_NOT_SUPPORTED)) {
requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
prop.requestedHandleTypes = (CUmemAllocationHandleType)requestedHandleTypes;
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0));
} else {
MSCCLPP_CUTHROW(result);
}
nvlsCompatibleMemHandleType = (CUmemAllocationHandleType)requestedHandleTypes;

if (align == 0) {
align = getMulticastGranularity(nbytes, CU_MULTICAST_GRANULARITY_MINIMUM);
Expand Down Expand Up @@ -172,12 +190,10 @@ bool isNvlsSupported() {
#if (CUDA_NVLS_SUPPORTED)
if (!isChecked) {
int isMulticastSupported;
int isFabricSupported;
CUdevice dev;
MSCCLPP_CUTHROW(cuCtxGetDevice(&dev));
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, dev));
result = (isMulticastSupported == 1 && isFabricSupported == 1);
return isMulticastSupported == 1;
}
return result;
#endif
Expand Down
12 changes: 11 additions & 1 deletion src/include/registered_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,14 @@ struct TransportInfo {
IbMrInfo ibMrInfo;
};
struct {
char shareableHandle[64];
union {
char shareableHandle[64];
struct {
// These are only defined for multicast (NVLS) capability
pid_t rootPid;
int fileDesc;
};
};
size_t offsetFromBase;
};
};
Expand All @@ -47,6 +54,9 @@ struct RegisteredMemory::Impl {
TransportFlags transports;
std::vector<TransportInfo> transportInfos;

// For sharing memory handle via file descriptor
int fileDesc = -1;

Impl(void* data, size_t size, TransportFlags transports, Context::Impl& contextImpl);
/// Constructs a RegisteredMemory::Impl from a vector of data. The constructor should only be used for the remote
/// memory.
Expand Down
66 changes: 57 additions & 9 deletions src/registered_memory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@

#include "registered_memory.hpp"

#include <sys/syscall.h>
#include <unistd.h>

#include <algorithm>
#include <mscclpp/gpu_utils.hpp>

Expand All @@ -24,9 +27,13 @@
} while (false)

namespace {
CUmemAllocationHandleType getNvlsCompatibleMemHandleType() {
CUmemAllocationHandleType getNvlsMemHandleType() {
#if (CUDA_NVLS_SUPPORTED)
return CU_MEM_HANDLE_TYPE_FABRIC;
if (mscclpp::detail::nvlsCompatibleMemHandleType & CU_MEM_HANDLE_TYPE_FABRIC) {
return CU_MEM_HANDLE_TYPE_FABRIC;
} else {
return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
}
#else
throw mscclpp::Error("Only support GPU with NVLS support", mscclpp::ErrorCode::InvalidUsage);
#endif
Expand Down Expand Up @@ -72,8 +79,16 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports,
if (this->isCuMemMapAlloc) {
CUmemGenericAllocationHandle handle;
MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&handle, baseDataPtr));
MSCCLPP_CUTHROW(
cuMemExportToShareableHandle(transportInfo.shareableHandle, handle, getNvlsCompatibleMemHandleType(), 0));
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) {
MSCCLPP_CUTHROW(cuMemExportToShareableHandle(transportInfo.shareableHandle, handle, getNvlsMemHandleType(), 0));
} else {
transportInfo.rootPid = getpid();
if (transportInfo.rootPid < 0) {
throw mscclpp::SysError("getpid() failed", errno);
}
MSCCLPP_CUTHROW(cuMemExportToShareableHandle(&transportInfo.fileDesc, handle, getNvlsMemHandleType(), 0));
fileDesc = transportInfo.fileDesc;
}
transportInfo.offsetFromBase = (char*)data - (char*)baseDataPtr;
MSCCLPP_CUTHROW(cuMemRelease(handle));
} else {
Expand Down Expand Up @@ -138,8 +153,13 @@ MSCCLPP_API_CPP std::vector<char> RegisteredMemory::serialize() {
std::copy_n(reinterpret_cast<char*>(&entry.transport), sizeof(entry.transport), std::back_inserter(result));
if (entry.transport == Transport::CudaIpc) {
if (pimpl_->isCuMemMapAlloc) {
std::copy_n(reinterpret_cast<char*>(&entry.shareableHandle), sizeof(entry.shareableHandle),
std::back_inserter(result));
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) {
std::copy_n(reinterpret_cast<char*>(&entry.shareableHandle), sizeof(entry.shareableHandle),
std::back_inserter(result));
} else {
std::copy_n(reinterpret_cast<char*>(&entry.rootPid), sizeof(entry.rootPid), std::back_inserter(result));
std::copy_n(reinterpret_cast<char*>(&entry.fileDesc), sizeof(entry.fileDesc), std::back_inserter(result));
}
std::copy_n(reinterpret_cast<char*>(&entry.offsetFromBase), sizeof(entry.offsetFromBase),
std::back_inserter(result));
} else {
Expand Down Expand Up @@ -184,8 +204,16 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
it += sizeof(transportInfo.transport);
if (transportInfo.transport == Transport::CudaIpc) {
if (this->isCuMemMapAlloc) {
std::copy_n(it, sizeof(transportInfo.shareableHandle), reinterpret_cast<char*>(&transportInfo.shareableHandle));
it += sizeof(transportInfo.shareableHandle);
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) {
std::copy_n(it, sizeof(transportInfo.shareableHandle),
reinterpret_cast<char*>(&transportInfo.shareableHandle));
it += sizeof(transportInfo.shareableHandle);
} else {
std::copy_n(it, sizeof(transportInfo.rootPid), reinterpret_cast<char*>(&transportInfo.rootPid));
it += sizeof(transportInfo.rootPid);
std::copy_n(it, sizeof(transportInfo.fileDesc), reinterpret_cast<char*>(&transportInfo.fileDesc));
it += sizeof(transportInfo.fileDesc);
}
std::copy_n(it, sizeof(transportInfo.offsetFromBase), reinterpret_cast<char*>(&transportInfo.offsetFromBase));
it += sizeof(transportInfo.offsetFromBase);
} else {
Expand Down Expand Up @@ -220,7 +248,23 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
if (this->isCuMemMapAlloc) {
#if (CUDA_NVLS_SUPPORTED)
CUmemGenericAllocationHandle handle;
MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&handle, entry.shareableHandle, getNvlsCompatibleMemHandleType()));
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) {
MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&handle, entry.shareableHandle, getNvlsMemHandleType()));
} else {
int rootPidFd = syscall(SYS_pidfd_open, entry.rootPid, 0);
if (rootPidFd < 0) {
throw mscclpp::SysError("pidfd_open() failed", errno);
}
int fileDesc = syscall(SYS_pidfd_getfd, rootPidFd, entry.fileDesc, 0);
if (fileDesc < 0) {
throw mscclpp::SysError("pidfd_getfd() failed", errno);
}
INFO(MSCCLPP_P2P, "Get file descriptor %d from pidfd %d on peer 0x%lx", fileDesc, rootPidFd, hostHash);
MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&handle, reinterpret_cast<void*>(fileDesc),
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
close(rootPidFd);
close(fileDesc);
}
size_t minGran = detail::getMulticastGranularity(size, CU_MULTICAST_GRANULARITY_MINIMUM);
size_t recommendedGran = detail::getMulticastGranularity(size, CU_MULTICAST_GRANULARITY_RECOMMENDED);
size_t size = (this->size + recommendedGran - 1) / recommendedGran * recommendedGran;
Expand Down Expand Up @@ -257,6 +301,9 @@ RegisteredMemory::Impl::~Impl() {
MSCCLPP_CULOG_WARN(cuMemUnmap((CUdeviceptr)base, size));
MSCCLPP_CULOG_WARN(cuMemRelease(handle));
MSCCLPP_CULOG_WARN(cuMemAddressFree((CUdeviceptr)base, size));
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR && fileDesc >= 0) {
close(fileDesc);
}
} else {
cudaError_t err = cudaIpcCloseMemHandle(base);
if (err != cudaSuccess) {
Expand All @@ -266,6 +313,7 @@ RegisteredMemory::Impl::~Impl() {
}
}
data = nullptr;
fileDesc = -1;
}
}

Expand Down
Loading