Skip to content

Commit 342ec08

Browse files
authored
Revert "changes for hugepages backed host buffer for larger allocations (#1841)" (#1951)
This reverts commit 65b69bf.
1 parent 5978d2f commit 342ec08

File tree

3 files changed: 18 additions and 87 deletions.

src/include/alloc.h

Lines changed: 12 additions & 76 deletions
Original file line number | Diff line number | Diff line change
@@ -18,10 +18,6 @@
1818
#include <stdlib.h>
1919
#include <string.h>
2020
#include "rccl_vars.h"
21-
#include <unordered_map>
22-
#include <mutex>
23-
24-
#define RCCL_HP_MIN_SIZE 2097152
2521

2622
#if CUDART_VERSION >= 11030
2723
#include <cuda.h>
@@ -35,9 +31,6 @@ constexpr size_t ncclSizeOfT() { return sizeof(T); }
3531
template<>
3632
constexpr size_t ncclSizeOfT<void>() { return 1; }
3733

38-
extern std::unordered_map<void*, size_t> hugepageAllocs;
39-
extern std::mutex hugepageAllocsMutex;
40-
4134
#if CUDART_VERSION >= 12020
4235

4336
static inline ncclResult_t ncclCuMemHostAlloc(void** ptr, CUmemGenericAllocationHandle *handlep, size_t size) {
@@ -112,100 +105,43 @@ static inline ncclResult_t ncclCuMemHostFree(void* ptr) {
112105
}
113106

114107
#endif /* CUDART_VERSION >= 12020 */
108+
115109
template <typename T>
116-
ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line, int hp_request=0 ) {
110+
ncclResult_t ncclCudaHostCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {
117111
ncclResult_t result = ncclSuccess;
118112
cudaStreamCaptureMode mode = cudaStreamCaptureModeRelaxed;
119113
*ptr = nullptr;
120-
size_t size = nelem * ncclSizeOfT<T>();
121-
122114
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
123115
int managed = 0;
124-
int huge=0;
125116
CUDACHECK(hipDeviceGetAttribute(&managed, hipDeviceAttributeDirectManagedMemAccessFromHost, 0));
126-
127117
if (nelem > 0) {
128118
if (managed) {
129119
#if defined(HIP_UNCACHED_MEMORY)
130-
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, size, hipDeviceMallocUncached), result, finish);
120+
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*ncclSizeOfT<T>(), hipDeviceMallocUncached), result, finish);
131121
#else
132-
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, size, hipDeviceMallocFinegrained), result, finish);
122+
CUDACHECKGOTO(hipExtMallocWithFlags((void**)ptr, nelem*ncclSizeOfT<T>(), hipDeviceMallocFinegrained), result, finish);
133123
#endif
134-
} else {
135-
if (hp_request) {
136-
if (size < RCCL_HP_MIN_SIZE) {
137-
WARN("small size : forcing back to hipHostMalloc");
124+
} else
138125
#if defined(HIP_HOST_UNCACHED_MEMORY)
139-
CUDACHECKGOTO(hipHostMalloc(ptr, size, cudaHostAllocMapped | hipHostMallocUncached), result, finish);
126+
CUDACHECKGOTO(hipHostMalloc(ptr, nelem*ncclSizeOfT<T>(), cudaHostAllocMapped | hipHostMallocUncached), result, finish);
140127
#else
141-
CUDACHECKGOTO(hipHostMalloc(ptr, size, cudaHostAllocMapped), result, finish);
128+
CUDACHECKGOTO(hipHostMalloc(ptr, nelem*ncclSizeOfT<T>(), cudaHostAllocMapped), result, finish);
142129
#endif
143-
memset(*ptr, 0, size);
144-
} else {
145-
// Hugepage allocation via mmap
146-
void* hostPtr = mmap(NULL, size, PROT_READ | PROT_WRITE,
147-
MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB, -1, 0);
148-
if (hostPtr == MAP_FAILED) {
149-
WARN("Hugepage allocation failed. Falling back to hipHostMalloc");
150-
#if defined(HIP_HOST_UNCACHED_MEMORY)
151-
CUDACHECKGOTO(hipHostMalloc(ptr, size, cudaHostAllocMapped | hipHostMallocUncached), result, finish);
152-
#else
153-
CUDACHECKGOTO(hipHostMalloc(ptr, size, cudaHostAllocMapped), result, finish);
154-
#endif
155-
memset(*ptr, 0, size);
156-
} else {
157-
memset(hostPtr, 0, size);
158-
CUDACHECKGOTO(hipHostRegister(hostPtr, size, hipHostRegisterMapped), result, finish);
159-
void* devPtr = nullptr;
160-
CUDACHECKGOTO(hipHostGetDevicePointer(&devPtr, hostPtr, 0), result, finish);
161-
*ptr = reinterpret_cast<T*>(hostPtr);
162-
INFO(NCCL_ALLOC, "Cuda Host Alloc Size done using hugepages");
163-
huge=1;
164-
std::lock_guard<std::mutex> lock(hugepageAllocsMutex);
165-
hugepageAllocs[hostPtr] = size;
166-
for (auto &kv : hugepageAllocs) INFO(NCCL_ALLOC, "updated Hugepage alloc ptr %p size %zu", kv.first, kv.second);
167-
}
168-
}
169-
} else {
170-
#if defined(HIP_HOST_UNCACHED_MEMORY)
171-
CUDACHECKGOTO(hipHostMalloc(ptr, size, cudaHostAllocMapped | hipHostMallocUncached), result, finish);
172-
#else
173-
CUDACHECKGOTO(hipHostMalloc(ptr, size, cudaHostAllocMapped), result, finish);
174-
#endif
175-
memset(*ptr, 0, size);
176-
}
177-
}
130+
memset(*ptr, 0, nelem*ncclSizeOfT<T>());
178131
}
179-
180132
finish:
181133
CUDACHECK(cudaThreadExchangeStreamCaptureMode(&mode));
182-
if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA host alloc %ld bytes", size);
183-
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p hp_request %d managed %d hugepage_alloc %d", filefunc, line, size, *ptr, hp_request, managed, huge);
134+
if (*ptr == nullptr && nelem > 0) WARN("Failed to CUDA host alloc %ld bytes", nelem*ncclSizeOfT<T>());
135+
INFO(NCCL_ALLOC, "%s:%d Cuda Host Alloc Size %ld pointer %p", filefunc, line, nelem*ncclSizeOfT<T>(), *ptr);
184136
return result;
185137
}
186138

187-
188-
static inline ncclResult_t ncclCudaHostFree(void* ptr, size_t alloc_size=0, int hp_request=0) {
189-
if (hp_request) {
190-
if (alloc_size > 0) {
191-
std::lock_guard<std::mutex> lock(hugepageAllocsMutex);
192-
// for (auto &kv : hugepageAllocs) INFO(NCCL_ALLOC, "Hugepage alloc ptr %p size %zu", kv.first, kv.second);
193-
auto it = hugepageAllocs.find(ptr);
194-
if (it != hugepageAllocs.end()) {
195-
// INFO(NCCL_ALLOC, "%s:%d Cuda Host HugePage unmap size %ld pointer %p app_tracked_size %ld", __FILE__, __LINE__, it->second, ptr, alloc_size);
196-
hipHostUnregister(ptr);
197-
munmap(ptr, it->second);
198-
hugepageAllocs.erase(it);
199-
return ncclSuccess;
200-
}
201-
}
202-
INFO(NCCL_ALLOC, "Cudafree being done to %p, size=%ld", ptr,alloc_size);
203-
}
139+
static inline ncclResult_t ncclCudaHostFree(void* ptr) {
204140
CUDACHECK(cudaFreeHost(ptr));
205141
return ncclSuccess;
206142
}
207143

208-
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__, 0)
144+
#define ncclCudaHostCalloc(...) ncclCudaHostCallocDebug(__VA_ARGS__, __FILE__, __LINE__)
209145

210146
template <typename T>
211147
ncclResult_t ncclCallocDebug(T** ptr, size_t nelem, const char *filefunc, int line) {

src/init.cc

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -95,8 +95,6 @@ NCCL_PARAM(NvlsChannels, "NVLS_NCHANNELS", NCCL_CONFIG_UNDEF_INT);
9595

9696
struct allocationTracker allocTracker[MAX_ALLOC_TRACK_NGPU] = {};
9797
static ncclResult_t commReclaim(ncclComm_t comm);
98-
std::unordered_map<void*, size_t> hugepageAllocs;
99-
std::mutex hugepageAllocsMutex;
10098

10199
#ifdef ENABLE_MSCCLPP
102100
size_t std::hash<ncclUniqueId>::operator ()(const ncclUniqueId& uniqueId) const noexcept {

src/transport/net.cc

Lines changed: 6 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -179,7 +179,6 @@ static ncclResult_t canConnect(int* ret, struct ncclComm* comm, struct ncclTopoG
179179
NCCL_PARAM(NetSharedBuffers, "NET_SHARED_BUFFERS", -2);
180180
NCCL_PARAM(NetSharedComms, "NET_SHARED_COMMS", 1);
181181

182-
RCCL_PARAM(NetHostBufferHugePageAlloc, "NET_HOST_BUFFER_HUGE_PAGE_ALLOC", 0);
183182
#if defined(HIP_CONTIGUOUS_MEMORY)
184183
RCCL_PARAM(NetContiguousMem, "NET_CONTIGUOUS_MEM", 0);
185184
#endif
@@ -603,7 +602,7 @@ static ncclResult_t sharedNetBuffersInit(struct ncclProxyState* proxyState, int
603602
}
604603
}
605604
if (!cuda && state->hostBuff == NULL) {
606-
NCCLCHECK(ncclCudaHostCallocDebug(&state->hostBuff, state->size, __FILE__, __LINE__, rcclParamNetHostBufferHugePageAlloc()));
605+
NCCLCHECK(ncclCudaHostCalloc(&state->hostBuff, state->size));
607606
}
608607
if (cpuPtr) *cpuPtr = cuda ? state->cudaBuff : state->hostBuff;
609608
if (gpuPtr) *gpuPtr = (cpuPtr && sameProcess) ? *cpuPtr : NULL;
@@ -632,9 +631,7 @@ static ncclResult_t sharedNetBuffersDestroy(struct ncclProxyState* proxyState, i
632631
}
633632
NCCLCHECK(ncclCudaFree(state->cudaBuff));
634633
}
635-
if (state->hostBuff) {
636-
NCCLCHECK(ncclCudaHostFree(state->hostBuff, (state->size)*(sizeof(int64_t)), rcclParamNetHostBufferHugePageAlloc()));
637-
}
634+
if (state->hostBuff) NCCLCHECK(ncclCudaHostFree(state->hostBuff));
638635
}
639636

640637
if (peer->send.refcount || peer->recv.refcount) return ncclSuccess;
@@ -891,7 +888,7 @@ static ncclResult_t sendProxyConnect(struct ncclProxyConnection* connection, str
891888
}
892889
}
893890
if (map->sameProcess) {
894-
NCCLCHECK(ncclCudaHostCallocDebug(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size, __FILE__, __LINE__, rcclParamNetHostBufferHugePageAlloc()));
891+
NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
895892
map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
896893
} else {
897894
NCCLCHECK(netCreateShm(proxyState, map->mems+NCCL_NET_MAP_HOSTMEM));
@@ -1093,7 +1090,7 @@ static ncclResult_t recvProxyConnect(struct ncclProxyConnection* connection, str
10931090
map->mems[NCCL_NET_MAP_DEVMEM].cpuPtr = map->mems[NCCL_NET_MAP_DEVMEM].gpuPtr;
10941091
}
10951092
}
1096-
NCCLCHECK(ncclCudaHostCallocDebug(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size, __FILE__, __LINE__, rcclParamNetHostBufferHugePageAlloc()));
1093+
NCCLCHECK(ncclCudaHostCalloc(&map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, map->mems[NCCL_NET_MAP_HOSTMEM].size));
10971094
map->mems[NCCL_NET_MAP_HOSTMEM].gpuPtr = map->mems[NCCL_NET_MAP_HOSTMEM].cpuPtr;
10981095
if (ncclGdrCopy && map->sameProcess) {
10991096
uint64_t *cpuPtr, *gpuPtr;
@@ -1168,7 +1165,7 @@ static ncclResult_t sendProxyFree(struct ncclProxyConnection* connection, struct
11681165
}
11691166
struct connectMapMem* mems = resources->map.mems;
11701167
if (resources->map.sameProcess) {
1171-
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, (mems[NCCL_NET_MAP_HOSTMEM].size)*(sizeof(int)), rcclParamNetHostBufferHugePageAlloc()));
1168+
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
11721169
} else {
11731170
NCCLCHECK(ncclShmIpcClose(&mems[NCCL_NET_MAP_HOSTMEM].createDesc));
11741171
}
@@ -1212,7 +1209,7 @@ static ncclResult_t recvProxyFree(struct ncclProxyConnection* connection, struct
12121209
}
12131210
}
12141211
struct connectMapMem* mems = resources->map.mems;
1215-
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr, (mems[NCCL_NET_MAP_HOSTMEM].size)*(sizeof(int)), rcclParamNetHostBufferHugePageAlloc()));
1212+
NCCLCHECK(ncclCudaHostFree(mems[NCCL_NET_MAP_HOSTMEM].cpuPtr));
12161213
NCCLCHECK(ncclCudaFree(mems[NCCL_NET_MAP_DEVMEM].cpuPtr));
12171214
if (!resources->map.sameProcess || ncclCuMemEnable()) {
12181215
// cuMem API support

0 commit comments

Comments
 (0)