1818#include < stdlib.h>
1919#include < string.h>
2020#include " rccl_vars.h"
21- #include < unordered_map>
22- #include < mutex>
23-
24- #define RCCL_HP_MIN_SIZE 2097152
2521
2622#if CUDART_VERSION >= 11030
2723#include < cuda.h>
@@ -35,9 +31,6 @@ constexpr size_t ncclSizeOfT() { return sizeof(T); }
3531template <>
3632constexpr size_t ncclSizeOfT<void >() { return 1 ; }
3733
38- extern std::unordered_map<void *, size_t > hugepageAllocs;
39- extern std::mutex hugepageAllocsMutex;
40-
4134#if CUDART_VERSION >= 12020
4235
4336static inline ncclResult_t ncclCuMemHostAlloc (void ** ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
@@ -112,100 +105,43 @@ static inline ncclResult_t ncclCuMemHostFree(void* ptr) {
112105}
113106
114107#endif /* CUDART_VERSION >= 12020 */
108+
115109template <typename T>
116- ncclResult_t ncclCudaHostCallocDebug (T** ptr, size_t nelem, const char *filefunc, int line, int hp_request= 0 ) {
110+ ncclResult_t ncclCudaHostCallocDebug (T** ptr, size_t nelem, const char *filefunc, int line) {
117111 ncclResult_t result = ncclSuccess;
118112 cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
119113 *ptr = nullptr ;
120- size_t size = nelem * ncclSizeOfT<T>();
121-
122114 CUDACHECK (cudaThreadExchangeStreamCaptureMode (&mode));
123115 int managed = 0 ;
124- int huge=0 ;
125116 CUDACHECK (hipDeviceGetAttribute (&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0 ));
126-
127117 if (nelem > 0 ) {
128118 if (managed) {
129119#if defined(HIP_UNCACHED_MEMORY)
130- CUDACHECKGOTO (hipExtMallocWithFlags ((void **)ptr, size , hipDeviceMallocUncached), result, finish);
120+ CUDACHECKGOTO (hipExtMallocWithFlags ((void **)ptr, nelem*ncclSizeOfT<T>() , hipDeviceMallocUncached), result, finish);
131121#else
132- CUDACHECKGOTO (hipExtMallocWithFlags ((void **)ptr, size , hipDeviceMallocFinegrained), result, finish);
122+ CUDACHECKGOTO (hipExtMallocWithFlags ((void **)ptr, nelem*ncclSizeOfT<T>() , hipDeviceMallocFinegrained), result, finish);
133123#endif
134- } else {
135- if (hp_request) {
136- if (size < RCCL_HP_MIN_SIZE) {
137- WARN (" small size : forcing back to hipHostMalloc" );
124+ } else
138125#if defined(HIP_HOST_UNCACHED_MEMORY)
139- CUDACHECKGOTO (hipHostMalloc (ptr, size , cudaHostAllocMapped | hipHostMallocUncached), result, finish);
126+ CUDACHECKGOTO (hipHostMalloc (ptr, nelem*ncclSizeOfT<T>() , cudaHostAllocMapped | hipHostMallocUncached), result, finish);
140127#else
141- CUDACHECKGOTO (hipHostMalloc (ptr, size , cudaHostAllocMapped), result, finish);
128+ CUDACHECKGOTO (hipHostMalloc (ptr, nelem*ncclSizeOfT<T>() , cudaHostAllocMapped), result, finish);
142129#endif
143- memset (*ptr, 0 , size);
144- } else {
145- // Hugepage allocation via mmap
146- void * hostPtr = mmap (NULL , size, PROT_READ | PROT_WRITE,
147- MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1 , 0 );
148- if (hostPtr == MAP_FAILED) {
149- WARN (" Hugepage allocation failed. Falling back to hipHostMalloc" );
150- #if defined(HIP_HOST_UNCACHED_MEMORY)
151- CUDACHECKGOTO (hipHostMalloc (ptr, size, cudaHostAllocMapped | hipHostMallocUncached), result, finish);
152- #else
153- CUDACHECKGOTO (hipHostMalloc (ptr, size, cudaHostAllocMapped), result, finish);
154- #endif
155- memset (*ptr, 0 , size);
156- } else {
157- memset (hostPtr, 0 , size);
158- CUDACHECKGOTO (hipHostRegister (hostPtr, size, hipHostRegisterMapped), result, finish);
159- void * devPtr = nullptr ;
160- CUDACHECKGOTO (hipHostGetDevicePointer (&devPtr, hostPtr, 0 ), result, finish);
161- *ptr = reinterpret_cast <T*>(hostPtr);
162- INFO (NCCL_ALLOC, " Cuda Host Alloc Size done using hugepages" );
163- huge=1 ;
164- std::lock_guard<std::mutex> lock (hugepageAllocsMutex);
165- hugepageAllocs[hostPtr] = size;
166- for (auto &kv : hugepageAllocs) INFO (NCCL_ALLOC, " updated Hugepage alloc ptr %p size %zu" , kv.first , kv.second );
167- }
168- }
169- } else {
170- #if defined(HIP_HOST_UNCACHED_MEMORY)
171- CUDACHECKGOTO (hipHostMalloc (ptr, size, cudaHostAllocMapped | hipHostMallocUncached), result, finish);
172- #else
173- CUDACHECKGOTO (hipHostMalloc (ptr, size, cudaHostAllocMapped), result, finish);
174- #endif
175- memset (*ptr, 0 , size);
176- }
177- }
130+ memset (*ptr, 0 , nelem*ncclSizeOfT<T>());
178131 }
179-
180132finish:
181133 CUDACHECK (cudaThreadExchangeStreamCaptureMode (&mode));
182- if (*ptr == nullptr && nelem > 0 ) WARN (" Failed to CUDA host alloc %ld bytes" , size );
183- INFO (NCCL_ALLOC, " %s:%d Cuda Host Alloc Size %ld pointer %p hp_request %d managed %d hugepage_alloc %d " , filefunc, line, size , *ptr, hp_request, managed, huge );
134+ if (*ptr == nullptr && nelem > 0 ) WARN (" Failed to CUDA host alloc %ld bytes" , nelem*ncclSizeOfT<T>() );
135+ INFO (NCCL_ALLOC, " %s:%d Cuda Host Alloc Size %ld pointer %p" , filefunc, line, nelem*ncclSizeOfT<T>() , *ptr);
184136 return result;
185137}
186138
187-
188- static inline ncclResult_t ncclCudaHostFree (void * ptr, size_t alloc_size=0 , int hp_request=0 ) {
189- if (hp_request) {
190- if (alloc_size > 0 ) {
191- std::lock_guard<std::mutex> lock (hugepageAllocsMutex);
192- // for (auto &kv : hugepageAllocs) INFO(NCCL_ALLOC, "Hugepage alloc ptr %p size %zu", kv.first, kv.second);
193- auto it = hugepageAllocs.find (ptr);
194- if (it != hugepageAllocs.end ()) {
195- // INFO(NCCL_ALLOC, "%s:%d Cuda Host HugePage unmap size %ld pointer %p app_tracked_size %ld", __FILE__, __LINE__, it->second, ptr, alloc_size);
196- hipHostUnregister (ptr);
197- munmap (ptr, it->second );
198- hugepageAllocs.erase (it);
199- return ncclSuccess;
200- }
201- }
202- INFO (NCCL_ALLOC, " Cudafree being done to %p, size=%ld" , ptr,alloc_size);
203- }
139+ static inline ncclResult_t ncclCudaHostFree (void * ptr) {
204140 CUDACHECK (cudaFreeHost (ptr));
205141 return ncclSuccess;
206142}
207143
208- #define ncclCudaHostCalloc (...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__, 0 )
144+ #define ncclCudaHostCalloc (...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
209145
210146template <typename T>
211147ncclResult_t ncclCallocDebug (T** ptr, size_t nelem, const char *filefunc, int line) {
0 commit comments