diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md index e7527f842..4545174cc 100644 --- a/docs/getting-started/quickstart.md +++ b/docs/getting-started/quickstart.md @@ -29,10 +29,7 @@ ``` lsmod | grep nvidia_peermem ``` - * For GPU with nvls support, the IMEX channels should be set up (refer [cuMemCreate](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g899d69a862bba36449789c64b430dc7c)). You can set up the channels manually via: - ``` - sudo nvidia-modprobe -s -i - ``` + * For GPU with nvls support, we require the kernel version to be 5.6 or above. ## Build with Docker Images diff --git a/include/mscclpp/gpu.hpp b/include/mscclpp/gpu.hpp index 914f32e8e..423c72187 100644 --- a/include/mscclpp/gpu.hpp +++ b/include/mscclpp/gpu.hpp @@ -101,10 +101,15 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri // NVLS #if !defined(__HIP_PLATFORM_AMD__) #include <linux/version.h> -// We need CU_MEM_HANDLE_TYPE_FABRIC (instroduced in cuda12.3) to support sharing handles across GPUs via sockets -#define CUDA_NVLS_SUPPORTED ((CUDART_VERSION >= 12030) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0))) -#else // !defined(__HIP_PLATFORM_AMD__) +#if CUDART_VERSION < 12030 +#define CU_MEM_HANDLE_TYPE_FABRIC ((CUmemAllocationHandleType)0x8ULL) +#endif +// We need CUDA 12.0 above and kernel 5.6.0 above for NVLS +#define CUDA_NVLS_SUPPORTED ((CUDART_VERSION >= 12000) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0))) +#else // defined(__HIP_PLATFORM_AMD__) #define CUDA_NVLS_SUPPORTED 0 +// NVLS is not supported on AMD platform, just to avoid compilation error +#define CU_MEM_HANDLE_TYPE_FABRIC (0x8ULL) #endif // !defined(__HIP_PLATFORM_AMD__) // GPU sync threads diff --git a/include/mscclpp/gpu_utils.hpp b/include/mscclpp/gpu_utils.hpp index 1d8730624..174f59540 100644 --- a/include/mscclpp/gpu_utils.hpp +++ b/include/mscclpp/gpu_utils.hpp @@ -63,6 +63,7 @@ void* gpuCallocHost(size_t bytes); void*
gpuCallocUncached(size_t bytes); #endif // defined(__HIP_PLATFORM_AMD__) #if (CUDA_NVLS_SUPPORTED) +extern CUmemAllocationHandleType nvlsCompatibleMemHandleType; void* gpuCallocPhysical(size_t bytes, size_t gran = 0, size_t align = 0); #endif // CUDA_NVLS_SUPPORTED diff --git a/src/gpu_utils.cc b/src/gpu_utils.cc index 8fa9bf58b..afab9ee05 100644 --- a/src/gpu_utils.cc +++ b/src/gpu_utils.cc @@ -30,6 +30,8 @@ bool CudaStreamWithFlags::empty() const { return stream_ == nullptr; } namespace detail { +CUmemAllocationHandleType nvlsCompatibleMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + /// set memory access permission to read-write /// @param base Base memory pointer. /// @param size Size of the memory. @@ -96,11 +98,17 @@ void* gpuCallocPhysical(size_t bytes, size_t gran, size_t align) { MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId)); MSCCLPP_CUTHROW(cuDeviceGet(&currentDevice, deviceId)); + int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + int isFabricSupported; + MSCCLPP_CUTHROW( + cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDevice)); + if (isFabricSupported) { + requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC; + } CUmemAllocationProp prop = {}; prop.type = CU_MEM_ALLOCATION_TYPE_PINNED; prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE; - prop.requestedHandleTypes = - (CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC); + prop.requestedHandleTypes = (CUmemAllocationHandleType)(requestedHandleTypes); prop.location.id = currentDevice; if (gran == 0) { @@ -110,7 +118,16 @@ void* gpuCallocPhysical(size_t bytes, size_t gran, size_t align) { // allocate physical memory CUmemGenericAllocationHandle memHandle; size_t nbytes = (bytes + gran - 1) / gran * gran; - MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0 /*flags*/)); + CUresult result = cuMemCreate(&memHandle, nbytes, &prop, 0); + if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC
&& + (result == CUDA_ERROR_NOT_PERMITTED || result == CUDA_ERROR_NOT_SUPPORTED)) { + requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + prop.requestedHandleTypes = (CUmemAllocationHandleType)requestedHandleTypes; + MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0)); + } else { + MSCCLPP_CUTHROW(result); + } + nvlsCompatibleMemHandleType = (CUmemAllocationHandleType)requestedHandleTypes; if (align == 0) { align = getMulticastGranularity(nbytes, CU_MULTICAST_GRANULARITY_MINIMUM); @@ -172,12 +189,10 @@ bool isNvlsSupported() { #if (CUDA_NVLS_SUPPORTED) if (!isChecked) { int isMulticastSupported; - int isFabricSupported; CUdevice dev; MSCCLPP_CUTHROW(cuCtxGetDevice(&dev)); MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev)); - MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, dev)); - result = (isMulticastSupported == 1 && isFabricSupported == 1); + return isMulticastSupported == 1; } return result; #endif diff --git a/src/include/registered_memory.hpp b/src/include/registered_memory.hpp index 2f7727636..dd8ff24d2 100644 --- a/src/include/registered_memory.hpp +++ b/src/include/registered_memory.hpp @@ -28,7 +28,14 @@ struct TransportInfo { IbMrInfo ibMrInfo; }; struct { - char shareableHandle[64]; + union { + char shareableHandle[64]; + struct { + // These are only defined for multicast (NVLS) capability + pid_t rootPid; + int fileDesc; + }; + }; size_t offsetFromBase; }; }; @@ -47,6 +54,9 @@ struct RegisteredMemory::Impl { TransportFlags transports; std::vector<TransportInfo> transportInfos; + // For sharing memory handle via file descriptor + int fileDesc = -1; + Impl(void* data, size_t size, TransportFlags transports, Context::Impl& contextImpl); /// Constructs a RegisteredMemory::Impl from a vector of data. The constructor should only be used for the remote /// memory.
diff --git a/src/registered_memory.cc b/src/registered_memory.cc index 72fe3aa5f..b10f46c40 100644 --- a/src/registered_memory.cc +++ b/src/registered_memory.cc @@ -3,6 +3,9 @@ #include "registered_memory.hpp" +#include <sys/syscall.h> +#include <unistd.h> + #include <algorithm> #include <mscclpp/gpu_utils.hpp> @@ -24,9 +27,13 @@ } while (false) namespace { -CUmemAllocationHandleType getNvlsCompatibleMemHandleType() { +CUmemAllocationHandleType getNvlsMemHandleType() { #if (CUDA_NVLS_SUPPORTED) - return CU_MEM_HANDLE_TYPE_FABRIC; + if (mscclpp::detail::nvlsCompatibleMemHandleType & CU_MEM_HANDLE_TYPE_FABRIC) { + return CU_MEM_HANDLE_TYPE_FABRIC; + } else { + return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR; + } #else throw mscclpp::Error("Only support GPU with NVLS support", mscclpp::ErrorCode::InvalidUsage); #endif @@ -72,8 +79,16 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports, if (this->isCuMemMapAlloc) { CUmemGenericAllocationHandle handle; MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&handle, baseDataPtr)); - MSCCLPP_CUTHROW( - cuMemExportToShareableHandle(transportInfo.shareableHandle, handle, getNvlsCompatibleMemHandleType(), 0)); + if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) { + MSCCLPP_CUTHROW(cuMemExportToShareableHandle(transportInfo.shareableHandle, handle, getNvlsMemHandleType(), 0)); + } else { + transportInfo.rootPid = getpid(); + if (transportInfo.rootPid < 0) { + throw mscclpp::SysError("getpid() failed", errno); + } + MSCCLPP_CUTHROW(cuMemExportToShareableHandle(&transportInfo.fileDesc, handle, getNvlsMemHandleType(), 0)); + this->fileDesc = transportInfo.fileDesc; + } transportInfo.offsetFromBase = (char*)data - (char*)baseDataPtr; MSCCLPP_CUTHROW(cuMemRelease(handle)); } else { @@ -138,8 +153,13 @@ MSCCLPP_API_CPP std::vector<char> RegisteredMemory::serialize() { std::copy_n(reinterpret_cast<char*>(&entry.transport), sizeof(entry.transport), std::back_inserter(result)); if (entry.transport == Transport::CudaIpc) { if (pimpl_->isCuMemMapAlloc) { -
std::copy_n(reinterpret_cast<char*>(&entry.shareableHandle), sizeof(entry.shareableHandle), - std::back_inserter(result)); + if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) { + std::copy_n(reinterpret_cast<char*>(&entry.shareableHandle), sizeof(entry.shareableHandle), + std::back_inserter(result)); + } else { + std::copy_n(reinterpret_cast<char*>(&entry.rootPid), sizeof(entry.rootPid), std::back_inserter(result)); + std::copy_n(reinterpret_cast<char*>(&entry.fileDesc), sizeof(entry.fileDesc), std::back_inserter(result)); + } std::copy_n(reinterpret_cast<char*>(&entry.offsetFromBase), sizeof(entry.offsetFromBase), std::back_inserter(result)); } else { @@ -184,8 +204,16 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) { it += sizeof(transportInfo.transport); if (transportInfo.transport == Transport::CudaIpc) { if (this->isCuMemMapAlloc) { - std::copy_n(it, sizeof(transportInfo.shareableHandle), reinterpret_cast<char*>(&transportInfo.shareableHandle)); - it += sizeof(transportInfo.shareableHandle); + if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) { + std::copy_n(it, sizeof(transportInfo.shareableHandle), + reinterpret_cast<char*>(&transportInfo.shareableHandle)); + it += sizeof(transportInfo.shareableHandle); + } else { + std::copy_n(it, sizeof(transportInfo.rootPid), reinterpret_cast<char*>(&transportInfo.rootPid)); + it += sizeof(transportInfo.rootPid); + std::copy_n(it, sizeof(transportInfo.fileDesc), reinterpret_cast<char*>(&transportInfo.fileDesc)); + it += sizeof(transportInfo.fileDesc); + } std::copy_n(it, sizeof(transportInfo.offsetFromBase), reinterpret_cast<char*>(&transportInfo.offsetFromBase)); it += sizeof(transportInfo.offsetFromBase); } else { @@ -220,7 +248,23 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) { if (this->isCuMemMapAlloc) { #if (CUDA_NVLS_SUPPORTED) CUmemGenericAllocationHandle handle; - MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&handle, entry.shareableHandle, getNvlsCompatibleMemHandleType())); + if (getNvlsMemHandleType() ==
CU_MEM_HANDLE_TYPE_FABRIC) { + MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&handle, entry.shareableHandle, getNvlsMemHandleType())); + } else { + int rootPidFd = syscall(SYS_pidfd_open, entry.rootPid, 0); + if (rootPidFd < 0) { + throw mscclpp::SysError("pidfd_open() failed", errno); + } + int fd = syscall(SYS_pidfd_getfd, rootPidFd, entry.fileDesc, 0); + if (fd < 0) { + throw mscclpp::SysError("pidfd_getfd() failed", errno); + } + INFO(MSCCLPP_P2P, "Get file descriptor %d from pidfd %d on peer 0x%lx", fd, rootPidFd, hostHash); + MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&handle, reinterpret_cast<void*>(fd), + CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR)); + close(rootPidFd); + close(fd); + } size_t minGran = detail::getMulticastGranularity(size, CU_MULTICAST_GRANULARITY_MINIMUM); size_t recommendedGran = detail::getMulticastGranularity(size, CU_MULTICAST_GRANULARITY_RECOMMENDED); size_t size = (this->size + recommendedGran - 1) / recommendedGran * recommendedGran; @@ -257,6 +301,9 @@ RegisteredMemory::Impl::~Impl() { MSCCLPP_CULOG_WARN(cuMemUnmap((CUdeviceptr)base, size)); MSCCLPP_CULOG_WARN(cuMemRelease(handle)); MSCCLPP_CULOG_WARN(cuMemAddressFree((CUdeviceptr)base, size)); + if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR && fileDesc >= 0) { + close(fileDesc); + } } else { cudaError_t err = cudaIpcCloseMemHandle(base); if (err != cudaSuccess) { @@ -266,6 +313,7 @@ RegisteredMemory::Impl::~Impl() { } } data = nullptr; + fileDesc = -1; } }