Merge pull request #503 from abergeron/nccl2

nouiz · web-flow · commit 351f3591396c · 2017-08-23T15:09:09.000-04:00
Upgrade bindings to NCCL 2.0
diff --git a/src/gpuarray_collectives_cuda_nccl.c b/src/gpuarray_collectives_cuda_nccl.c
@@ -153,7 +153,7 @@ static int get_rank(const gpucomm *comm, int *rank) {
  * \ref
  * ncclRedOp_t.
  *
- * If invalid, return `nccl_NUM_OPS`.
+ * If invalid, return `ncclNumOps`.
  */
 static inline ncclRedOp_t convert_reduce_op(int opcode) {
   switch (opcode) {
@@ -162,14 +162,14 @@ static inline ncclRedOp_t convert_reduce_op(int opcode) {
   case GA_MAX: return ncclMax;
   case GA_MIN: return ncclMin;
   }
-  return nccl_NUM_OPS;
+  return ncclNumOps;
 }
 
 /**
  * \brief Helper function to try to convert \ref enum GPUARRAY_TYPES to \ref
  * ncclDataType_t.
  *
- * If invalid, return `nccl_NUM_TYPES`.
+ * If invalid, return `ncclNumTypes`.
  */
 static inline ncclDataType_t convert_data_type(int typecode) {
   switch (typecode) {
@@ -181,7 +181,7 @@ static inline ncclDataType_t convert_data_type(int typecode) {
   case GA_ULONG: return ncclUint64;
   case GA_HALF: return ncclHalf;
   }
-  return nccl_NUM_TYPES;
+  return ncclNumTypes;
 }
 
 /**
@@ -208,13 +208,13 @@ static inline int check_restrictions(gpudata *src, size_t offsrc,
   // typecode must correspond to a valid ncclDataType_t
   if (datatype != NULL) {
     *datatype = convert_data_type(typecode);
-    if (*datatype == nccl_NUM_TYPES)
+    if (*datatype == ncclNumTypes)
       return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid data type");
   }
   // opcode must correspond to a valid ncclRedOp_t
   if (op != NULL) {
     *op = convert_reduce_op(opcode);
-    if (*op == nccl_NUM_OPS)
+    if (*op == ncclNumOps)
       return error_set(comm->ctx->err, GA_INVALID_ERROR, "Invalid reduce op");
   }
   // offsets must not be larger than gpudata's size itself
@@ -237,8 +237,8 @@ static int reduce(gpudata *src, size_t offsrc, gpudata *dest, size_t offdest,
                   size_t count, int typecode, int opcode, int root,
                   gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclRedOp_t op = nccl_NUM_OPS;
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclRedOp_t op = ncclNumOps;
+  ncclDataType_t datatype = ncclNumTypes;
   gpudata *dst = NULL;
   int rank = 0;
   cuda_context *ctx;
@@ -287,8 +287,8 @@ static int all_reduce(gpudata *src, size_t offsrc, gpudata *dest,
                       size_t offdest, size_t count, int typecode, int opcode,
                       gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclRedOp_t op = nccl_NUM_OPS;
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclRedOp_t op = ncclNumOps;
+  ncclDataType_t datatype = ncclNumTypes;
   cuda_context *ctx;
 
   ASSERT_BUF(src);
@@ -325,8 +325,8 @@ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest,
                           size_t offdest, size_t count, int typecode,
                           int opcode, gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclRedOp_t op = nccl_NUM_OPS;
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclRedOp_t op = ncclNumOps;
+  ncclDataType_t datatype = ncclNumTypes;
   int ndev = 0;
   size_t resc_size;
   cuda_context *ctx;
@@ -371,7 +371,7 @@ static int reduce_scatter(gpudata *src, size_t offsrc, gpudata *dest,
 static int broadcast(gpudata *array, size_t offset, size_t count, int typecode,
                      int root, gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclDataType_t datatype = ncclNumTypes;
   int rank = 0;
   cuda_context *ctx;
 
@@ -411,7 +411,7 @@ static int all_gather(gpudata *src, size_t offsrc, gpudata *dest,
                       size_t offdest, size_t count, int typecode,
                       gpucomm *comm) {
   // need dummy init so that compiler shuts up
-  ncclDataType_t datatype = nccl_NUM_TYPES;
+  ncclDataType_t datatype = ncclNumTypes;
   int ndev = 0;
   size_t resc_size;
   cuda_context *ctx;
@@ -439,8 +439,8 @@ static int all_gather(gpudata *src, size_t offsrc, gpudata *dest,
 
   // change stream of nccl ops to enable concurrency
   NCCL_EXIT_ON_ERROR(
-      ctx, ncclAllGather((void *)(src->ptr + offsrc), count, datatype,
-                         (void *)(dest->ptr + offdest), comm->c, ctx->s));
+      ctx, ncclAllGather((void *)(src->ptr + offsrc),
+			 (void *)(dest->ptr + offdest), count, datatype, comm->c, ctx->s));
 
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(src, CUDA_WAIT_READ));
   GA_CUDA_EXIT_ON_ERROR(ctx, cuda_record(dest, CUDA_WAIT_WRITE));
diff --git a/src/loaders/libnccl.c b/src/loaders/libnccl.c
@@ -40,6 +40,9 @@ int load_libnccl(error *e) {
 
   #include "libnccl.fn"
 
+  if (ga_func_ptr(lib, "ncclGroupStart", e) == NULL)
+    return error_set(e, GA_LOAD_ERROR, "Found NCCL 1.0 but NCCL 2.0 required");
+
   loaded = 1;
   return GA_NO_ERROR;
 }
diff --git a/src/loaders/libnccl.fn b/src/loaders/libnccl.fn
@@ -4,9 +4,8 @@ DEF_PROC(void, ncclCommDestroy, (ncclComm_t comm));
 DEF_PROC(ncclResult_t, ncclCommCount, (const ncclComm_t comm, int* count));
 DEF_PROC(ncclResult_t, ncclCommUserRank, (const ncclComm_t comm, int* rank));
 DEF_PROC(const char*, ncclGetErrorString, (ncclResult_t result));
-DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuf, int count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, int count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, int recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm,
-cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclBcast, (void* buff, int count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream));
-DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, int count, ncclDataType_t datatype, void* recvbuff, ncclComm_t comm, cudaStream_t stream));
+DEF_PROC(ncclResult_t, ncclReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, int root, ncclComm_t comm, cudaStream_t stream));
+DEF_PROC(ncclResult_t, ncclAllReduce, (const void* sendbuff, void* recvbuff, size_t count, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream ));
+DEF_PROC(ncclResult_t, ncclReduceScatter, (const void* sendbuff, void* recvbuff, size_t recvcount, ncclDataType_t datatype, ncclRedOp_t op, ncclComm_t comm, cudaStream_t stream));
+DEF_PROC(ncclResult_t, ncclBcast, (void* buff, size_t count, ncclDataType_t datatype, int root, ncclComm_t comm, cudaStream_t stream ));
+DEF_PROC(ncclResult_t, ncclAllGather, (const void* sendbuff, void* recvbuff, size_t sendcount, ncclDataType_t datatype, ncclComm_t comm, cudaStream_t stream));
diff --git a/src/loaders/libnccl.h b/src/loaders/libnccl.h
@@ -13,21 +13,23 @@ typedef struct { char internal[NCCL_UNIQUE_ID_BYTES]; } ncclUniqueId;
 
 typedef enum { ncclSuccess = 0 } ncclResult_t;
 
+/* Reduction operation selector */
 typedef enum { ncclSum        = 0,
                ncclProd       = 1,
                ncclMax        = 2,
                ncclMin        = 3,
-               nccl_NUM_OPS   = 4 } ncclRedOp_t;
-
+               ncclNumOps     = 4 } ncclRedOp_t;
 /* Data types */
-typedef enum { ncclChar       = 0,
-               ncclInt        = 1,
-               ncclHalf       = 2,
-               ncclFloat      = 3,
-               ncclDouble     = 4,
-               ncclInt64      = 5,
-               ncclUint64     = 6,
-               nccl_NUM_TYPES = 7 } ncclDataType_t;
+typedef enum { ncclInt8       = 0, ncclChar       = 0,
+               ncclUint8      = 1,
+               ncclInt32      = 2, ncclInt        = 2,
+               ncclUint32     = 3,
+               ncclInt64      = 4,
+               ncclUint64     = 5,
+               ncclFloat16    = 6, ncclHalf       = 6,
+               ncclFloat32    = 7, ncclFloat      = 7,
+               ncclFloat64    = 8, ncclDouble     = 8,
+               ncclNumTypes   = 9 } ncclDataType_t;
 
 /** @endcond */
 

Original file line number	Diff line number	Diff line change
`@@ -40,6 +40,9 @@ int load_libnccl(error *e) {`
`40`	`40`
`41`	`41`	`#include "libnccl.fn"`
`42`	`42`
	`43`	`+ if (ga_func_ptr(lib, "ncclGroupStart", e) == NULL)`
	`44`	`+ return error_set(e, GA_LOAD_ERROR, "Found NCCL 1.0 but NCCL 2.0 required");`
	`45`	`+`
`43`	`46`	`loaded = 1;`
`44`	`47`	`return GA_NO_ERROR;`
`45`	`48`	`}`