#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include "rccl_vars.h"
#include <unordered_map>
#include <mutex>

// Minimum request size (2 MiB) for which an explicit hugepage allocation is attempted.
#define RCCL_HP_MIN_SIZE 2097152
2125
2226#if CUDART_VERSION >= 11030
2327#include < cuda.h>
@@ -31,6 +35,9 @@ constexpr size_t ncclSizeOfT() { return sizeof(T); }
3135template <>
3236constexpr size_t ncclSizeOfT<void >() { return 1 ; }
3337
38+ extern std::unordered_map<void *, size_t > hugepageAllocs;
39+ extern std::mutex hugepageAllocsMutex;
40+
3441#if CUDART_VERSION >= 12020
3542
3643static inline ncclResult_t ncclCuMemHostAlloc (void ** ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
@@ -105,43 +112,100 @@ static inline ncclResult_t ncclCuMemHostFree(void* ptr) {
105112}
106113
107114#endif /* CUDART_VERSION >= 12020 */
108-
109115template <typename T>
110- ncclResult_t ncclCudaHostCallocDebug (T** ptr, size_t nelem, const char *filefunc, int line) {
116+ ncclResult_t ncclCudaHostCallocDebug (T** ptr, size_t nelem, const char *filefunc, int line, int hp_request= 0 ) {
111117 ncclResult_t result = ncclSuccess;
112118 cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
113119 *ptr = nullptr ;
120+ size_t size = nelem * ncclSizeOfT<T>();
121+
114122 CUDACHECK (cudaThreadExchangeStreamCaptureMode (&mode));
115123 int managed = 0 ;
124+ int huge=0 ;
116125 CUDACHECK (hipDeviceGetAttribute (&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0 ));
126+
117127 if (nelem > 0 ) {
118128 if (managed) {
119129#if defined(HIP_UNCACHED_MEMORY)
120- CUDACHECKGOTO (hipExtMallocWithFlags ((void **)ptr, nelem*ncclSizeOfT<T>() , hipDeviceMallocUncached), result, finish);
130+ CUDACHECKGOTO (hipExtMallocWithFlags ((void **)ptr, size , hipDeviceMallocUncached), result, finish);
121131#else
122- CUDACHECKGOTO (hipExtMallocWithFlags ((void **)ptr, nelem*ncclSizeOfT<T>() , hipDeviceMallocFinegrained), result, finish);
132+ CUDACHECKGOTO (hipExtMallocWithFlags ((void **)ptr, size , hipDeviceMallocFinegrained), result, finish);
123133#endif
124- } else
134+ } else {
135+ if (hp_request) {
136+ if (size < RCCL_HP_MIN_SIZE) {
137+ WARN (" small size : forcing back to hipHostMalloc" );
125138#if defined(HIP_HOST_UNCACHED_MEMORY)
126- CUDACHECKGOTO (hipHostMalloc (ptr, nelem*ncclSizeOfT<T>() , cudaHostAllocMapped | hipHostMallocUncached), result, finish);
139+ CUDACHECKGOTO (hipHostMalloc (ptr, size , cudaHostAllocMapped | hipHostMallocUncached), result, finish);
127140#else
128- CUDACHECKGOTO (hipHostMalloc (ptr, nelem*ncclSizeOfT<T>() , cudaHostAllocMapped), result, finish);
141+ CUDACHECKGOTO (hipHostMalloc (ptr, size , cudaHostAllocMapped), result, finish);
129142#endif
130- memset (*ptr, 0 , nelem*ncclSizeOfT<T>());
143+ memset (*ptr, 0 , size);
144+ } else {
145+ // Hugepage allocation via mmap
146+ void * hostPtr = mmap (NULL , size, PROT_READ | PROT_WRITE,
147+ MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1 , 0 );
148+ if (hostPtr == MAP_FAILED) {
149+ WARN (" Hugepage allocation failed. Falling back to hipHostMalloc" );
150+ #if defined(HIP_HOST_UNCACHED_MEMORY)
151+ CUDACHECKGOTO (hipHostMalloc (ptr, size, cudaHostAllocMapped | hipHostMallocUncached), result, finish);
152+ #else
153+ CUDACHECKGOTO (hipHostMalloc (ptr, size, cudaHostAllocMapped), result, finish);
154+ #endif
155+ memset (*ptr, 0 , size);
156+ } else {
157+ memset (hostPtr, 0 , size);
158+ CUDACHECKGOTO (hipHostRegister (hostPtr, size, hipHostRegisterMapped), result, finish);
159+ void * devPtr = nullptr ;
160+ CUDACHECKGOTO (hipHostGetDevicePointer (&devPtr, hostPtr, 0 ), result, finish);
161+ *ptr = reinterpret_cast <T*>(hostPtr);
162+ INFO (NCCL_ALLOC, " Cuda Host Alloc Size done using hugepages" );
163+ huge=1 ;
164+ std::lock_guard<std::mutex> lock (hugepageAllocsMutex);
165+ hugepageAllocs[hostPtr] = size;
166+ for (auto &kv : hugepageAllocs) INFO (NCCL_ALLOC, " updated Hugepage alloc ptr %p size %zu" , kv.first , kv.second );
167+ }
168+ }
169+ } else {
170+ #if defined(HIP_HOST_UNCACHED_MEMORY)
171+ CUDACHECKGOTO (hipHostMalloc (ptr, size, cudaHostAllocMapped | hipHostMallocUncached), result, finish);
172+ #else
173+ CUDACHECKGOTO (hipHostMalloc (ptr, size, cudaHostAllocMapped), result, finish);
174+ #endif
175+ memset (*ptr, 0 , size);
176+ }
177+ }
131178 }
179+
132180finish:
133181 CUDACHECK (cudaThreadExchangeStreamCaptureMode (&mode));
134- if (*ptr == nullptr && nelem > 0 ) WARN (" Failed to CUDA host alloc %ld bytes" , nelem*ncclSizeOfT<T>() );
135- INFO (NCCL_ALLOC, " %s:%d Cuda Host Alloc Size %ld pointer %p" , filefunc, line, nelem*ncclSizeOfT<T>() , *ptr);
182+ if (*ptr == nullptr && nelem > 0 ) WARN (" Failed to CUDA host alloc %ld bytes" , size );
183+ INFO (NCCL_ALLOC, " %s:%d Cuda Host Alloc Size %ld pointer %p hp_request %d managed %d hugepage_alloc %d " , filefunc, line, size , *ptr, hp_request, managed, huge );
136184 return result;
137185}
138186
139- static inline ncclResult_t ncclCudaHostFree (void * ptr) {
187+
188+ static inline ncclResult_t ncclCudaHostFree (void * ptr, size_t alloc_size=0 , int hp_request=0 ) {
189+ if (hp_request) {
190+ if (alloc_size > 0 ) {
191+ std::lock_guard<std::mutex> lock (hugepageAllocsMutex);
192+ // for (auto &kv : hugepageAllocs) INFO(NCCL_ALLOC, "Hugepage alloc ptr %p size %zu", kv.first, kv.second);
193+ auto it = hugepageAllocs.find (ptr);
194+ if (it != hugepageAllocs.end ()) {
195+ // INFO(NCCL_ALLOC, "%s:%d Cuda Host HugePage unmap size %ld pointer %p app_tracked_size %ld", __FILE__, __LINE__, it->second, ptr, alloc_size);
196+ hipHostUnregister (ptr);
197+ munmap (ptr, it->second );
198+ hugepageAllocs.erase (it);
199+ return ncclSuccess;
200+ }
201+ }
202+ INFO (NCCL_ALLOC, " Cudafree being done to %p, size=%ld" , ptr,alloc_size);
203+ }
140204 CUDACHECK (cudaFreeHost (ptr));
141205 return ncclSuccess;
142206}
143207
// Convenience wrapper that stamps the call site; the trailing 0 means
// hp_request = 0 (no hugepage request) for existing callers.
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__, 0)
145209
146210template <typename T>
147211ncclResult_t ncclCallocDebug (T** ptr, size_t nelem, const char *filefunc, int line) {
0 commit comments