#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include "rccl_vars.h"
#include <unordered_map>
#include <mutex>

// Minimum request size (2 MiB) for which an explicit hugepage allocation is attempted.
#define RCCL_HP_MIN_SIZE 2097152
2125
2226#if CUDART_VERSION >= 11030
2327#include < cuda.h>
@@ -31,6 +35,9 @@ constexpr size_t ncclSizeOfT() { return sizeof(T); }
3135template <>
3236constexpr size_t ncclSizeOfT<void >() { return 1 ; }
3337
38+ extern std::unordered_map<void *, size_t > hugepageAllocs;
39+ extern std::mutex hugepageAllocsMutex;
40+
3441#if CUDART_VERSION >= 12020
3542
3643static inline ncclResult_t ncclCuMemHostAlloc (void ** ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
@@ -105,43 +112,100 @@ static inline ncclResult_t ncclCuMemHostFree(void* ptr) {
105112}
106113
107114#endif /* CUDART_VERSION >= 12020 */
108-
109115template <typename T>
110- ncclResult_t ncclCudaHostCallocDebug (T** ptr, size_t nelem, const char *filefunc, int line) {
116+ ncclResult_t ncclCudaHostCallocDebug (T** ptr, size_t nelem, const char *filefunc, int line, int hp_request= 0 ) {
111117 ncclResult_t result = ncclSuccess;
112118 cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
113119 *ptr = nullptr ;
120+ size_t size = nelem * ncclSizeOfT<T>();
121+
114122 CUDACHECK (cudaThreadExchangeStreamCaptureMode (&mode));
115123 int managed = 0 ;
124+ int huge=0 ;
116125 CUDACHECK (hipDeviceGetAttribute (&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0 ));
126+
117127 if (nelem > 0 ) {
118128 if (managed) {
119129#if defined(HIP_UNCACHED_MEMORY)
120- CUDACHECKGOTO (hipExtMallocWithFlags ((void **)ptr, nelem*ncclSizeOfT<T>() , hipDeviceMallocUncached), result, finish);
130+ CUDACHECKGOTO (hipExtMallocWithFlags ((void **)ptr, size , hipDeviceMallocUncached), result, finish);
121131#else
122- CUDACHECKGOTO (hipExtMallocWithFlags ((void **)ptr, nelem*ncclSizeOfT<T>() , hipDeviceMallocFinegrained), result, finish);
132+ CUDACHECKGOTO (hipExtMallocWithFlags ((void **)ptr, size , hipDeviceMallocFinegrained), result, finish);
123133#endif
124- } else
134+ } else {
135+ if (hp_request) {
136+ if (size < RCCL_HP_MIN_SIZE) {
137+ WARN (" small size : forcing back to hipHostMalloc" );
125138#if defined(HIP_HOST_UNCACHED_MEMORY)
126- CUDACHECKGOTO (hipHostMalloc (ptr, nelem*ncclSizeOfT<T>() , cudaHostAllocMapped | hipHostMallocUncached), result, finish);
139+ CUDACHECKGOTO (hipHostMalloc (ptr, size , cudaHostAllocMapped | hipHostMallocUncached), result, finish);
127140#else
128- CUDACHECKGOTO (hipHostMalloc (ptr, nelem*ncclSizeOfT<T>() , cudaHostAllocMapped), result, finish);
141+ CUDACHECKGOTO (hipHostMalloc (ptr, size , cudaHostAllocMapped), result, finish);
129142#endif
130- memset (*ptr, 0 , nelem*ncclSizeOfT<T>());
143+ memset (*ptr, 0 , size);
144+ } else {
145+ // Hugepage allocation via mmap
146+ void * hostPtr = mmap (NULL , size, PROT_READ | PROT_WRITE,
147+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1 , 0 );
148+ if (hostPtr == MAP_FAILED) {
149+ WARN (" Hugepage allocation failed. Falling back to hipHostMalloc" );
150+ #if defined(HIP_HOST_UNCACHED_MEMORY)
151+ CUDACHECKGOTO (hipHostMalloc (ptr, size, cudaHostAllocMapped | hipHostMallocUncached), result, finish);
152+ #else
153+ CUDACHECKGOTO (hipHostMalloc (ptr, size, cudaHostAllocMapped), result, finish);
154+ #endif
155+ memset (*ptr, 0 , size);
156+ } else {
157+ memset (hostPtr, 0 , size);
158+ CUDACHECKGOTO (hipHostRegister (hostPtr, size, hipHostRegisterMapped), result, finish);
159+ void * devPtr = nullptr ;
160+ CUDACHECKGOTO (hipHostGetDevicePointer (&devPtr, hostPtr, 0 ), result, finish);
161+ *ptr = reinterpret_cast <T*>(hostPtr);
162+ INFO (NCCL_ALLOC, " Cuda Host Alloc Size done using hugepages" );
163+ huge=1 ;
164+ std::lock_guard<std::mutex> lock (hugepageAllocsMutex);
165+ hugepageAllocs[hostPtr] = size;
166+ for (auto &kv : hugepageAllocs) INFO (NCCL_ALLOC, " updated Hugepage alloc ptr %p size %zu" , kv.first , kv.second );
167+ }
168+ }
169+ } else {
170+ #if defined(HIP_HOST_UNCACHED_MEMORY)
171+ CUDACHECKGOTO (hipHostMalloc (ptr, size, cudaHostAllocMapped | hipHostMallocUncached), result, finish);
172+ #else
173+ CUDACHECKGOTO (hipHostMalloc (ptr, size, cudaHostAllocMapped), result, finish);
174+ #endif
175+ memset (*ptr, 0 , size);
176+ }
177+ }
131178 }
179+
132180finish:
133181 CUDACHECK (cudaThreadExchangeStreamCaptureMode (&mode));
134- if (*ptr == nullptr && nelem > 0 ) WARN (" Failed to CUDA host alloc %ld bytes" , nelem*ncclSizeOfT<T>() );
135- INFO (NCCL_ALLOC, " %s:%d Cuda Host Alloc Size %ld pointer %p" , filefunc, line, nelem*ncclSizeOfT<T>() , *ptr);
182+ if (*ptr == nullptr && nelem > 0 ) WARN (" Failed to CUDA host alloc %ld bytes" , size );
183+ INFO (NCCL_ALLOC, " %s:%d Cuda Host Alloc Size %ld pointer %p hp_request %d managed %d hugepage_alloc %d " , filefunc, line, size , *ptr, hp_request, managed, huge );
136184 return result;
137185}
138186
139- static inline ncclResult_t ncclCudaHostFree (void * ptr) {
187+
188+ static inline ncclResult_t ncclCudaHostFree (void * ptr, size_t alloc_size=0 , int hp_request=0 ) {
189+ if (hp_request) {
190+ if (alloc_size > 0 ) {
191+ std::lock_guard<std::mutex> lock (hugepageAllocsMutex);
192+ // for (auto &kv : hugepageAllocs) INFO(NCCL_ALLOC, "Hugepage alloc ptr %p size %zu", kv.first, kv.second);
193+ auto it = hugepageAllocs.find (ptr);
194+ if (it != hugepageAllocs.end ()) {
195+ // INFO(NCCL_ALLOC, "%s:%d Cuda Host HugePage unmap size %ld pointer %p app_tracked_size %ld", __FILE__, __LINE__, it->second, ptr, alloc_size);
196+ hipHostUnregister (ptr);
197+ munmap (ptr, it->second );
198+ hugepageAllocs.erase (it);
199+ return ncclSuccess;
200+ }
201+ }
202+ INFO (NCCL_ALLOC, " Cudafree being done to %p, size=%ld" , ptr,alloc_size);
203+ }
140204 CUDACHECK (cudaFreeHost (ptr));
141205 return ncclSuccess;
142206}
143207
// Convenience wrapper that stamps the call site; the trailing 0 means
// hp_request = 0 (no hugepage request) for existing callers.
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__, 0)
145209
146210template <typename T>
147211ncclResult_t ncclCallocDebug (T** ptr, size_t nelem, const char *filefunc, int line) {
0 commit comments