Commit 9885f08

Merge pull request #502 from abergeron/cuda9

Changes for CUDA 9.0 float16 support.

2 parents 351f359 + 8ac2448

File tree

8 files changed: +328 -359 lines changed

pygpu/collectives.pyx

Lines changed: 0 additions & 35 deletions

@@ -38,41 +38,6 @@ cdef class GpuCommCliqueId:
         if comm_id is not None:
             self.comm_id = comm_id
 
-    def __getbuffer__(self, Py_buffer* buffer, int flags):
-        if buffer == NULL:
-            raise BufferError, "NULL buffer view in getbuffer"
-
-        buffer.buf = <char*>self.c_comm_id.internal
-        buffer.obj = self
-        buffer.len = GA_COMM_ID_BYTES * sizeof(char)
-        buffer.readonly = 0
-        buffer.itemsize = sizeof(char)
-        if flags & PyBUF_FORMAT == PyBUF_FORMAT:
-            buffer.format = 'b'
-        else:
-            buffer.format = NULL
-        buffer.ndim = 1
-        if flags & PyBUF_ND == PyBUF_ND:
-            buffer.shape = <Py_ssize_t*>calloc(1, sizeof(Py_ssize_t))
-            buffer.shape[0] = GA_COMM_ID_BYTES
-        else:
-            buffer.shape = NULL
-        if flags & PyBUF_STRIDES == PyBUF_STRIDES:
-            buffer.strides = &buffer.itemsize
-        else:
-            buffer.strides = NULL
-        buffer.suboffsets = NULL
-        buffer.internal = NULL
-        Py_INCREF(self)
-
-    def __releasebuffer__(self, Py_buffer* buffer):
-        if buffer == NULL:
-            raise BufferError, "NULL buffer view in releasebuffer"
-
-        if buffer.shape != NULL:
-            free(buffer.shape)
-        Py_DECREF(self)
-
     def __richcmp__(this, that, int op):
         if type(this) != type(that):
             raise TypeError, "Cannot compare %s with %s" % (type(this), type(that))
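The two methods removed above implemented the PEP 3118 buffer protocol by hand: filling every Py_buffer field, calloc'ing and freeing the shape array, and balancing Py_INCREF/Py_DECREF on the owner. For comparison, here is a minimal C-API sketch of the same export; cliqueid_getbuffer and the GA_COMM_ID_BYTES value below are illustrative stand-ins, not libgpuarray code. PyBuffer_FillInfo does all of that bookkeeping for a simple 1-D byte buffer and needs no matching release handler.

#include <Python.h>

#define GA_COMM_ID_BYTES 128  /* stand-in; use the library's constant */

/* Expose `internal` as a writable 1-D byte buffer owned by `self`.
   PyBuffer_FillInfo fills buf/len/itemsize/format/shape/strides
   according to `flags` and INCREFs the owner; returns 0 on success. */
static int
cliqueid_getbuffer(PyObject *self, char *internal, Py_buffer *view, int flags)
{
    return PyBuffer_FillInfo(view, self, internal,
                             GA_COMM_ID_BYTES, /* readonly */ 0, flags);
}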

pygpu/tests/collectives/test_collectives.py renamed to pygpu/tests/test_collectives.py

Lines changed: 3 additions & 10 deletions

@@ -88,13 +88,6 @@ def test_richcmp(self):
         with self.assertRaises(TypeError):
             a = cid2 > "asdfasfa"
 
-    def test_as_buffer(self):
-        a = np.asarray(self.cid)
-        assert np.allclose(a, self.cid.comm_id)
-        a[:] = [ord(b'a')] * COMM_ID_BYTES
-        assert np.allclose(a, self.cid.comm_id)
-
-
 @unittest.skipUnless(MPI_IMPORTED, "Needs mpi4py module")
 @unittest.skipIf(get_user_gpu_rank() == -1, "Collective operations supported on CUDA devices only")
 class TestGpuComm(unittest.TestCase):

@@ -293,19 +286,19 @@ def test_all_gather(self):
 
         a = cpu.reshape((5, 2), order='F')
         exp = texp.reshape((5, 2 * self.size), order='F')
-        gpu = gpuarray.asarray(a, context=self.ctx)
+        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
         resgpu = self.gpucomm.all_gather(gpu, nd_up=0)
         check_all(resgpu, exp)
 
         a = cpu.reshape((5, 2), order='F')
         exp = texp.reshape((5, 2, self.size), order='F')
-        gpu = gpuarray.asarray(a, context=self.ctx)
+        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
         resgpu = self.gpucomm.all_gather(gpu, nd_up=1)
         check_all(resgpu, exp)
 
         a = cpu.reshape((5, 2), order='F')
         exp = texp.reshape((5, 2, 1, 1, self.size), order='F')
-        gpu = gpuarray.asarray(a, context=self.ctx)
+        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
         resgpu = self.gpucomm.all_gather(gpu, nd_up=3)
         check_all(resgpu, exp)
 

src/cluda_cuda.h

Lines changed: 8 additions & 4 deletions

@@ -65,10 +65,14 @@ struct ga_half {
   ga_ushort data;
 };
 
-#define ga_half2float(p) __half2float((p).data)
-__device__ static inline ga_half ga_float2half(float f) {
+static __device__ inline float ga_half2float(ga_half h) {
+  float r;
+  asm("{ cvt.f32.f16 %0, %1; }\n" : "=f"(r) : "h"(h.data));
+  return r;
+}
+static __device__ inline ga_half ga_float2half(float f) {
   ga_half r;
-  r.data = __float2half_rn(f);
+  asm("{ cvt.rn.f16.f32 %0, %1; }\n" : "=h"(r.data) : "f"(f));
   return r;
 }
 
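Dropping the __half2float/__float2half_rn intrinsics in favour of inline PTX is what makes this header work with CUDA 9.0, whose cuda_fp16.h turns __half into an opaque C++ type that no longer accepts a raw 16-bit integer. Emitting cvt.f32.f16 and cvt.rn.f16.f32 directly keeps the conversions tied to libgpuarray's own bit-level ga_half struct on every toolkit version. A self-contained sketch of the same pattern (the kernel and main are illustrative, not part of the commit):

// Build: nvcc half_roundtrip.cu -o half_roundtrip
#include <cstdio>

typedef unsigned short ga_ushort;
struct ga_half { ga_ushort data; };

// PTX-based conversions, as in cluda_cuda.h, independent of cuda_fp16.h.
static __device__ inline float ga_half2float(ga_half h) {
  float r;
  asm("{ cvt.f32.f16 %0, %1; }\n" : "=f"(r) : "h"(h.data));
  return r;
}
static __device__ inline ga_half ga_float2half(float f) {
  ga_half r;
  asm("{ cvt.rn.f16.f32 %0, %1; }\n" : "=h"(r.data) : "f"(f));
  return r;
}

// Round-trip: f32 -> f16 (round-to-nearest-even) -> f32.
__global__ void roundtrip(float *out, float in) {
  *out = ga_half2float(ga_float2half(in));
}

int main(void) {
  float *d_out, h_out = 0.0f;
  cudaMalloc(&d_out, sizeof(float));
  roundtrip<<<1, 1>>>(d_out, 0.1f);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("0.1f through float16: %.7f\n", h_out);  // ~0.0999756
  cudaFree(d_out);
  return 0;
}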
@@ -142,7 +146,7 @@ __device__ ga_half atom_add_eg(ga_half *addr, ga_half val) {
   do {
     assumed = old;
     tmp.data = __byte_perm(old, 0, ((ga_size)addr & 2) ? 0x4432 : 0x4410);
-    sum = __float2half_rn(__half2float(val.data) + __half2float(tmp.data));
+    sum = ga_float2half(ga_half2float(val) + ga_half2float(tmp)).data;
     new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
     old = atomicCAS(base, assumed, new_);
   } while (assumed != old);
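This hunk sits inside a CAS loop that emulates a 16-bit atomic add, since CUDA offers no atomicAdd on a lone 16-bit value across all architectures: the code CASes the aligned 32-bit word containing the half and uses __byte_perm to pull the 16-bit lane out and splice the updated lane back in. The fix above simply reroutes the float conversion through the new ga_half2float/ga_float2half helpers. The splicing pattern in isolation, on plain integers for clarity (a hedged sketch; the function name and the integer add are illustrative):

#include <cstddef>

// Add `val` to a 16-bit lane by CASing the enclosing aligned 32-bit word.
__device__ unsigned short atomic_add_u16(unsigned short *addr,
                                         unsigned short val) {
  unsigned int *base =
      (unsigned int *)((size_t)addr & ~(size_t)2);  // containing word
  unsigned int old = *base, assumed;
  unsigned short cur;
  do {
    assumed = old;
    // Extract the 16-bit lane `addr` points into (high or low half).
    cur = (unsigned short)__byte_perm(old, 0,
                                      ((size_t)addr & 2) ? 0x4432 : 0x4410);
    // Splice the updated lane back into the 32-bit word.
    unsigned int new_ = __byte_perm(old, (unsigned int)(cur + val),
                                    ((size_t)addr & 2) ? 0x5410 : 0x3254);
    old = atomicCAS(base, assumed, new_);
  } while (assumed != old);
  return cur;  // lane value just before our add landed
}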

src/cluda_cuda.h.c

Lines changed: 309 additions & 298 deletions (large diff not rendered)

src/gpuarray/ext_cuda.h

Lines changed: 1 addition & 0 deletions

@@ -2,6 +2,7 @@
 #define LIBGPU_EXT_CUDA
 
 #include <cuda.h>
+#include <cuda_fp16.h>
 
 #include <gpuarray/config.h>
 #include <gpuarray/buffer.h>

src/gpuarray_buffer_cuda.c

Lines changed: 5 additions & 8 deletions

@@ -2,6 +2,7 @@
 
 #include "private.h"
 #include "private_cuda.h"
+
 #include "loaders/libnvrtc.h"
 #include "loaders/libcublas.h"
 

@@ -1087,9 +1088,11 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) {
   size_t buflen;
   const char *heads[1] = {"cluda.h"};
   const char *hsrc[1];
-  const char *opts[4] = {
+  const char *opts[] = {
     "-arch", ""
+#ifdef DEBUG
     , "-G", "-lineinfo"
+#endif
   };
   nvrtcResult err;
 

@@ -1100,13 +1103,7 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) {
   if (err != NVRTC_SUCCESS)
     return error_nvrtc(ctx->err, "nvrtcCreateProgram", err);
 
-  err = nvrtcCompileProgram(prog,
-#ifdef DEBUG
-                            4,
-#else
-                            2,
-#endif
-                            opts);
+  err = nvrtcCompileProgram(prog, sizeof(opts)/sizeof(char *), opts);
 
   /* Get the log before handling the error */
   if (nvrtcGetProgramLogSize(prog, &buflen) == NVRTC_SUCCESS) {
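The rewrite above moves the DEBUG conditional from the argument count into the array initializer and derives the count with sizeof(opts)/sizeof(char *), so the two can no longer drift apart the way the hand-maintained 2-versus-4 literal could. The idiom in isolation (a stand-alone sketch, not library code):

#include <stdio.h>

/* The option count tracks the initializer: 2 normally, 4 with -DDEBUG. */
static const char *opts[] = {
  "-arch", ""
#ifdef DEBUG
  , "-G", "-lineinfo"
#endif
};

int main(void) {
  printf("%u options\n", (unsigned)(sizeof(opts) / sizeof(char *)));
  return 0;
}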

src/gpuarray_elemwise.c

Lines changed: 2 additions & 3 deletions

@@ -547,7 +547,6 @@ static int gen_elemwise_contig_kernel(GpuKernel *k,
 static int check_contig(GpuElemwise *ge, void **args,
                         size_t *_n, int *contig) {
   GpuArray *a = NULL, *v;
-  gpucontext *ctx = GpuKernel_context(&ge->k_contig);
   size_t n = 1;
   unsigned int i, j;
   int c_contig = 1, f_contig = 1;

@@ -563,10 +562,10 @@ static int check_contig(GpuElemwise *ge, void **args,
     f_contig &= GpuArray_IS_F_CONTIGUOUS(v);
     if (a != v) {
       if (a->nd != v->nd)
-        return error_fmt(ctx->err, GA_INVALID_ERROR, "Mismatched nd for input %u (expected %u, got %u)", i, a->nd, v->nd);
+        return -1; /* We don't check the value of the error code */
       for (j = 0; j < a->nd; j++) {
         if (v->dimensions[j] != a->dimensions[j])
-          return error_fmt(ctx->err, GA_VALUE_ERROR, "Mismatched dimension %u (expected %" SPREFIX "u, got %" SPREFIX "u)", j, a->dimensions[j], v->dimensions[j]);
+          return -1; /* We don't check the value of the error code */
       }
     }
   }

tests/check_collectives.c

Lines changed: 0 additions & 1 deletion

@@ -49,7 +49,6 @@ extern void teardown_comm(void);
   int(*EXP)[(outcols)]; \
   size_t indims[ND]; \
   size_t outdims[ND]; \
-  const ssize_t instrds[ND] = {sizeof(*A), sizeof(int)}; \
   const ssize_t outstrds[ND] = {sizeof(*RES), sizeof(int)}; \
   int err; \
   size_t i, j, outsize; \
