Skip to content

Commit 47bb672

Browse files
borisfomabergeron
authored and committed
NCCL 2.0 fix
1 parent 3bab96a commit 47bb672

File tree

3 files changed

+33
-32
lines changed

3 files changed

+33
-32
lines changed

src/gpuarray_collectives_cuda_nccl.c

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -153,7 +153,7 @@ static int get_rank(const gpucomm *comm, int *rank) {
153153
* \ref
154154
* ncclRedOp_t.
155155
*
156-
* If invalid, return `nccl_NUM_OPS`.
156+
* If invalid, return `ncclNumOps`.
157157
*/
158158
static inline ncclRedOp_t convert_reduce_op(int opcode) {
159159
switch (opcode) {
@@ -162,14 +162,14 @@ static inline ncclRedOp_t convert_reduce_op(int opcode) {
162162
case GA_MAX: return ncclMax;
163163
case GA_MIN: return ncclMin;
164164
}
165-
return nccl_NUM_OPS;
165+
return ncclNumOps;
166166
}
167167

168168
/**
169169
* \brief Helper function to try to convert \ref enum GPUARRAY_TYPES to \ref
170170
* ncclDataType_t.
171171
*
172-
* If invalid, return `nccl_NUM_TYPES`.
172+
* If invalid, return `ncclNumTypes`.
173173
*/
174174
static inline ncclDataType_t convert_data_type(int typecode) {
175175
switch (typecode) {
@@ -181,7 +181,7 @@ static inline ncclDataType_t convert_data_type(int typecode) {
181181
case GA_ULONG: return ncclUint64;
182182
case GA_HALF: return ncclHalf;
183183
}
184-
return nccl_NUM_TYPES;
184+
return ncclNumTypes;
185185
}
186186

187187
/**
@@ -208,13 +208,13 @@ static inline int check_restrictions(gpudata *src, size_t offsrc,
208208
// typecode must correspond to a valid ncclDataType_t
209209
if (datatype != NULL) {
210210
*datatype = convert_data_type(typecode);
211-
if (*datatype == nccl_NUM_TYPES)
211+
if (*datatype == ncclNumTypes)
212212
return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid data type");
213213
}
214214
// opcode must correspond to a valid ncclRedOp_t
215215
if (op != NULL) {
216216
*op = convert_reduce_op(opcode);
217-
if (*op == nccl_NUM_OPS)
217+
if (*op == ncclNumOps)
218218
return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid reduce op");
219219
}
220220
// offsets must not be larger than gpudata's size itself
@@ -237,8 +237,8 @@ static int reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest,
237237
size_t count, int typecode, int opcode, int root,
238238
gpucomm *comm) {
239239
// need dummy init so that compiler shuts up
240-
ncclRedOp_t op = nccl_NUM_OPS;
241-
ncclDataType_t datatype = nccl_NUM_TYPES;
240+
ncclRedOp_t op = ncclNumOps;
241+
ncclDataType_t datatype = ncclNumTypes;
242242
gpudata *dst = NULL;
243243
int rank = 0;
244244
cuda_context *ctx;
@@ -287,8 +287,8 @@ static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest,
287287
size_t offdest, size_t count, int typecode, int opcode,
288288
gpucomm *comm) {
289289
// need dummy init so that compiler shuts up
290-
ncclRedOp_t op = nccl_NUM_OPS;
291-
ncclDataType_t datatype = nccl_NUM_TYPES;
290+
ncclRedOp_t op = ncclNumOps;
291+
ncclDataType_t datatype = ncclNumTypes;
292292
cuda_context *ctx;
293293

294294
ASSERT_BUF(src);
@@ -325,8 +325,8 @@ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest,
325325
size_t offdest, size_t count, int typecode,
326326
int opcode, gpucomm *comm) {
327327
// need dummy init so that compiler shuts up
328-
ncclRedOp_t op = nccl_NUM_OPS;
329-
ncclDataType_t datatype = nccl_NUM_TYPES;
328+
ncclRedOp_t op = ncclNumOps;
329+
ncclDataType_t datatype = ncclNumTypes;
330330
int ndev = 0;
331331
size_t resc_size;
332332
cuda_context *ctx;
@@ -371,7 +371,7 @@ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest,
371371
static int broadcast(gpudata *array, size_t offset, size_t count, int typecode,
372372
int root, gpucomm *comm) {
373373
// need dummy init so that compiler shuts up
374-
ncclDataType_t datatype = nccl_NUM_TYPES;
374+
ncclDataType_t datatype = ncclNumTypes;
375375
int rank = 0;
376376
cuda_context *ctx;
377377

@@ -411,7 +411,7 @@ static int all_gather(gpudata *src, size_t offsrc, gpudata *dest,
411411
size_t offdest, size_t count, int typecode,
412412
gpucomm *comm) {
413413
// need dummy init so that compiler shuts up
414-
ncclDataType_t datatype = nccl_NUM_TYPES;
414+
ncclDataType_t datatype = ncclNumTypes;
415415
int ndev = 0;
416416
size_t resc_size;
417417
cuda_context *ctx;
@@ -439,8 +439,8 @@ static int all_gather(gpudata *src, size_t offsrc, gpudata *dest,
439439

440440
// change stream of nccl ops to enable concurrency
441441
NCCL_EXIT_ON_ERROR(
442-
ctx, ncclAllGather((void *)(src->ptr + offsrc), count, datatype,
443-
(void *)(dest->ptr + offdest), comm->c, ctx->s));
442+
ctx, ncclAllGather((void *)(src->ptr + offsrc),
443+
(void *)(dest->ptr + offdest), count, datatype, comm->c, ctx->s));
444444

445445
GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
446446
GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));

src/loaders/libnccl.fn

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,9 +4,8 @@ DEF_PROC(void, ncclCommDestroy, (ncclComm_t comm));
44
DEF_PROC(ncclResult_t, ncclCommCount, (const ncclComm_t comm, int* count));
55
DEF_PROC(ncclResult_t, ncclCommUserRank, (const ncclComm_t comm, int* rank));
66
DEF_PROC(const char*, ncclGetErrorString, (ncclResult_t result));
7-
DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream));
8-
DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, int count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
9-
DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
10-
cudaStream_t stream));
11-
DEF_PROC(ncclResult_t, ncclBcast, (void* buff, int count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream));
12-
DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, int count, ncclDataType_t datatype, void* recvbuff, ncclComm_t comm, cudaStream_t stream));
7+
DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream));
8+
DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream ));
9+
DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
10+
DEF_PROC(ncclResult_t, ncclBcast, (void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream ));
11+
DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream));

src/loaders/libnccl.h

Lines changed: 12 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,21 +13,23 @@ typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
1313

1414
typedef enum { ncclSuccess = 0 } ncclResult_t;
1515

16+
/* Reduction operation selector */
1617
typedef enum { ncclSum = 0,
1718
ncclProd = 1,
1819
ncclMax = 2,
1920
ncclMin = 3,
20-
nccl_NUM_OPS = 4 } ncclRedOp_t;
21-
21+
ncclNumOps = 4 } ncclRedOp_t;
2222
/* Data types */
23-
typedef enum { ncclChar = 0,
24-
ncclInt = 1,
25-
ncclHalf = 2,
26-
ncclFloat = 3,
27-
ncclDouble = 4,
28-
ncclInt64 = 5,
29-
ncclUint64 = 6,
30-
nccl_NUM_TYPES = 7 } ncclDataType_t;
23+
typedef enum { ncclInt8 = 0, ncclChar = 0,
24+
ncclUint8 = 1,
25+
ncclInt32 = 2, ncclInt = 2,
26+
ncclUint32 = 3,
27+
ncclInt64 = 4,
28+
ncclUint64 = 5,
29+
ncclFloat16 = 6, ncclHalf = 6,
30+
ncclFloat32 = 7, ncclFloat = 7,
31+
ncclFloat64 = 8, ncclDouble = 8,
32+
ncclNumTypes = 9 } ncclDataType_t;
3133

3234
/** @endcond */
3335

0 commit comments

Comments
 (0)