Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 1 addition & 4 deletions docs/getting-started/quickstart.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,7 @@
```
lsmod | grep nvidia_peermem
```
* For GPU with nvls support, the IMEX channels should be set up (refer [cuMemCreate](https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__VA.html#group__CUDA__VA_1g899d69a862bba36449789c64b430dc7c)). You can set up the channels manually via:
```
sudo nvidia-modprobe -s -i <start:number of minors>
```
* For GPUs with NVLS support, the kernel version must be 5.6 or above.

## Build with Docker Images

Expand Down
4 changes: 2 additions & 2 deletions include/mscclpp/gpu.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,8 +101,8 @@ constexpr auto CU_MEM_ACCESS_FLAGS_PROT_READWRITE = hipMemAccessFlagsProtReadWri
// NVLS
#if !defined(__HIP_PLATFORM_AMD__)
#include <linux/version.h>
// We need CU_MEM_HANDLE_TYPE_FABRIC (introduced in CUDA 12.3) to support sharing handles across GPUs via sockets
#define CUDA_NVLS_SUPPORTED ((CUDART_VERSION >= 12030) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)))
// We need CUDA 12.0 or above and kernel 5.6.0 or above for NVLS
#define CUDA_NVLS_SUPPORTED ((CUDART_VERSION >= 12000) && (LINUX_VERSION_CODE >= KERNEL_VERSION(5, 6, 0)))
#else // !defined(__HIP_PLATFORM_AMD__)
#define CUDA_NVLS_SUPPORTED 0
#endif // !defined(__HIP_PLATFORM_AMD__)
Expand Down
1 change: 1 addition & 0 deletions include/mscclpp/gpu_utils.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,6 +63,7 @@ void* gpuCallocHost(size_t bytes);
void* gpuCallocUncached(size_t bytes);
#endif // defined(__HIP_PLATFORM_AMD__)
#if (CUDA_NVLS_SUPPORTED)
extern CUmemAllocationHandleType nvlsCompatibleMemHandleType;
void* gpuCallocPhysical(size_t bytes, size_t gran = 0, size_t align = 0);
#endif // CUDA_NVLS_SUPPORTED

Expand Down
26 changes: 21 additions & 5 deletions src/gpu_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ bool CudaStreamWithFlags::empty() const { return stream_ == nullptr; }

namespace detail {

CUmemAllocationHandleType nvlsCompatibleMemHandleType = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;

/// set memory access permission to read-write
/// @param base Base memory pointer.
/// @param size Size of the memory.
Expand Down Expand Up @@ -96,11 +98,18 @@ void* gpuCallocPhysical(size_t bytes, size_t gran, size_t align) {
MSCCLPP_CUDATHROW(cudaGetDevice(&deviceId));
MSCCLPP_CUTHROW(cuDeviceGet(&currentDevice, deviceId));

int requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
int isFabricSupported;
MSCCLPP_CUTHROW(
cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, currentDevice));
if (isFabricSupported) {
requestedHandleTypes |= CU_MEM_HANDLE_TYPE_FABRIC;
}
CUmemAllocationProp prop = {};
prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
prop.requestedHandleTypes =
(CUmemAllocationHandleType)(CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR | CU_MEM_HANDLE_TYPE_FABRIC);
(CUmemAllocationHandleType)(requestedHandleTypes);
prop.location.id = currentDevice;

if (gran == 0) {
Expand All @@ -110,7 +119,16 @@ void* gpuCallocPhysical(size_t bytes, size_t gran, size_t align) {
// allocate physical memory
CUmemGenericAllocationHandle memHandle;
size_t nbytes = (bytes + gran - 1) / gran * gran;
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0 /*flags*/));
CUresult result = cuMemCreate(&memHandle, nbytes, &prop, 0);
if (requestedHandleTypes & CU_MEM_HANDLE_TYPE_FABRIC &&
(result == CUDA_ERROR_NOT_PERMITTED || result == CUDA_ERROR_NOT_SUPPORTED)) {
requestedHandleTypes = CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
prop.requestedHandleTypes = (CUmemAllocationHandleType)requestedHandleTypes;
MSCCLPP_CUTHROW(cuMemCreate(&memHandle, nbytes, &prop, 0));
} else {
MSCCLPP_CUTHROW(result);
}
nvlsCompatibleMemHandleType = (CUmemAllocationHandleType)requestedHandleTypes;

if (align == 0) {
align = getMulticastGranularity(nbytes, CU_MULTICAST_GRANULARITY_MINIMUM);
Expand Down Expand Up @@ -172,12 +190,10 @@ bool isNvlsSupported() {
#if (CUDA_NVLS_SUPPORTED)
if (!isChecked) {
int isMulticastSupported;
int isFabricSupported;
CUdevice dev;
MSCCLPP_CUTHROW(cuCtxGetDevice(&dev));
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isMulticastSupported, CU_DEVICE_ATTRIBUTE_MULTICAST_SUPPORTED, dev));
MSCCLPP_CUTHROW(cuDeviceGetAttribute(&isFabricSupported, CU_DEVICE_ATTRIBUTE_HANDLE_TYPE_FABRIC_SUPPORTED, dev));
result = (isMulticastSupported == 1 && isFabricSupported == 1);
return isMulticastSupported == 1;
}
return result;
#endif
Expand Down
12 changes: 11 additions & 1 deletion src/include/registered_memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,14 @@ struct TransportInfo {
IbMrInfo ibMrInfo;
};
struct {
char shareableHandle[64];
union {
char shareableHandle[64];
struct {
// These are only defined for multicast (NVLS) capability
pid_t rootPid;
int fileDesc;
};
};
size_t offsetFromBase;
};
};
Expand All @@ -47,6 +54,9 @@ struct RegisteredMemory::Impl {
TransportFlags transports;
std::vector<TransportInfo> transportInfos;

// For sharing memory handle via file descriptor
int fileDesc = -1;

Impl(void* data, size_t size, TransportFlags transports, Context::Impl& contextImpl);
/// Constructs a RegisteredMemory::Impl from a vector of data. The constructor should only be used for the remote
/// memory.
Expand Down
66 changes: 57 additions & 9 deletions src/registered_memory.cc
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@

#include "registered_memory.hpp"

#include <sys/syscall.h>
#include <unistd.h>

#include <algorithm>
#include <mscclpp/gpu_utils.hpp>

Expand All @@ -24,9 +27,13 @@
} while (false)

namespace {
CUmemAllocationHandleType getNvlsCompatibleMemHandleType() {
CUmemAllocationHandleType getNvlsMemHandleType() {
#if (CUDA_NVLS_SUPPORTED)
return CU_MEM_HANDLE_TYPE_FABRIC;
if (mscclpp::detail::nvlsCompatibleMemHandleType & CU_MEM_HANDLE_TYPE_FABRIC) {
return CU_MEM_HANDLE_TYPE_FABRIC;
} else {
return CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR;
}
#else
throw mscclpp::Error("Only support GPU with NVLS support", mscclpp::ErrorCode::InvalidUsage);
#endif
Expand Down Expand Up @@ -72,8 +79,16 @@ RegisteredMemory::Impl::Impl(void* data, size_t size, TransportFlags transports,
if (this->isCuMemMapAlloc) {
CUmemGenericAllocationHandle handle;
MSCCLPP_CUTHROW(cuMemRetainAllocationHandle(&handle, baseDataPtr));
MSCCLPP_CUTHROW(
cuMemExportToShareableHandle(transportInfo.shareableHandle, handle, getNvlsCompatibleMemHandleType(), 0));
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) {
MSCCLPP_CUTHROW(cuMemExportToShareableHandle(transportInfo.shareableHandle, handle, getNvlsMemHandleType(), 0));
} else {
transportInfo.rootPid = getpid();
if (transportInfo.rootPid < 0) {
throw mscclpp::SysError("getpid() failed", errno);
}
MSCCLPP_CUTHROW(cuMemExportToShareableHandle(&transportInfo.fileDesc, handle, getNvlsMemHandleType(), 0));
fileDesc = transportInfo.fileDesc;
}
transportInfo.offsetFromBase = (char*)data - (char*)baseDataPtr;
MSCCLPP_CUTHROW(cuMemRelease(handle));
} else {
Expand Down Expand Up @@ -138,8 +153,13 @@ MSCCLPP_API_CPP std::vector<char> RegisteredMemory::serialize() {
std::copy_n(reinterpret_cast<char*>(&entry.transport), sizeof(entry.transport), std::back_inserter(result));
if (entry.transport == Transport::CudaIpc) {
if (pimpl_->isCuMemMapAlloc) {
std::copy_n(reinterpret_cast<char*>(&entry.shareableHandle), sizeof(entry.shareableHandle),
std::back_inserter(result));
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) {
std::copy_n(reinterpret_cast<char*>(&entry.shareableHandle), sizeof(entry.shareableHandle),
std::back_inserter(result));
} else {
std::copy_n(reinterpret_cast<char*>(&entry.rootPid), sizeof(entry.rootPid), std::back_inserter(result));
std::copy_n(reinterpret_cast<char*>(&entry.fileDesc), sizeof(entry.fileDesc), std::back_inserter(result));
}
std::copy_n(reinterpret_cast<char*>(&entry.offsetFromBase), sizeof(entry.offsetFromBase),
std::back_inserter(result));
} else {
Expand Down Expand Up @@ -184,8 +204,16 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
it += sizeof(transportInfo.transport);
if (transportInfo.transport == Transport::CudaIpc) {
if (this->isCuMemMapAlloc) {
std::copy_n(it, sizeof(transportInfo.shareableHandle), reinterpret_cast<char*>(&transportInfo.shareableHandle));
it += sizeof(transportInfo.shareableHandle);
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) {
std::copy_n(it, sizeof(transportInfo.shareableHandle),
reinterpret_cast<char*>(&transportInfo.shareableHandle));
it += sizeof(transportInfo.shareableHandle);
} else {
std::copy_n(it, sizeof(transportInfo.rootPid), reinterpret_cast<char*>(&transportInfo.rootPid));
it += sizeof(transportInfo.rootPid);
std::copy_n(it, sizeof(transportInfo.fileDesc), reinterpret_cast<char*>(&transportInfo.fileDesc));
it += sizeof(transportInfo.fileDesc);
}
std::copy_n(it, sizeof(transportInfo.offsetFromBase), reinterpret_cast<char*>(&transportInfo.offsetFromBase));
it += sizeof(transportInfo.offsetFromBase);
} else {
Expand Down Expand Up @@ -220,7 +248,23 @@ RegisteredMemory::Impl::Impl(const std::vector<char>& serialization) {
if (this->isCuMemMapAlloc) {
#if (CUDA_NVLS_SUPPORTED)
CUmemGenericAllocationHandle handle;
MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&handle, entry.shareableHandle, getNvlsCompatibleMemHandleType()));
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_FABRIC) {
MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&handle, entry.shareableHandle, getNvlsMemHandleType()));
} else {
int rootPidFd = syscall(SYS_pidfd_open, entry.rootPid, 0);
if (rootPidFd < 0) {
throw mscclpp::SysError("pidfd_open() failed", errno);
}
int fileDesc = syscall(SYS_pidfd_getfd, rootPidFd, entry.fileDesc, 0);
if (fileDesc < 0) {
throw mscclpp::SysError("pidfd_getfd() failed", errno);
}
INFO(MSCCLPP_P2P, "Get file descriptor %d from pidfd %d on peer 0x%lx", fileDesc, rootPidFd, hostHash);
MSCCLPP_CUTHROW(cuMemImportFromShareableHandle(&handle, reinterpret_cast<void*>(fileDesc),
CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR));
close(rootPidFd);
close(fileDesc);
}
size_t minGran = detail::getMulticastGranularity(size, CU_MULTICAST_GRANULARITY_MINIMUM);
size_t recommendedGran = detail::getMulticastGranularity(size, CU_MULTICAST_GRANULARITY_RECOMMENDED);
size_t size = (this->size + recommendedGran - 1) / recommendedGran * recommendedGran;
Expand Down Expand Up @@ -257,6 +301,9 @@ RegisteredMemory::Impl::~Impl() {
MSCCLPP_CULOG_WARN(cuMemUnmap((CUdeviceptr)base, size));
MSCCLPP_CULOG_WARN(cuMemRelease(handle));
MSCCLPP_CULOG_WARN(cuMemAddressFree((CUdeviceptr)base, size));
if (getNvlsMemHandleType() == CU_MEM_HANDLE_TYPE_POSIX_FILE_DESCRIPTOR && fileDesc >= 0) {
close(fileDesc);
}
} else {
cudaError_t err = cudaIpcCloseMemHandle(base);
if (err != cudaSuccess) {
Expand All @@ -266,6 +313,7 @@ RegisteredMemory::Impl::~Impl() {
}
}
data = nullptr;
fileDesc = -1;
}
}

Expand Down
Loading