Commit 9885f08

Merge pull request #502 from abergeron/cuda9

Changes for CUDA 9.0 float16 support.

2 parents 351f359 + 8ac2448

File tree

8 files changed: +328 -359 lines changed

pygpu/collectives.pyx

Lines changed: 0 additions & 35 deletions

@@ -38,41 +38,6 @@ cdef class GpuCommCliqueId:
         if comm_id is not None:
             self.comm_id = comm_id
 
-    def __getbuffer__(self, Py_buffer* buffer, int flags):
-        if buffer == NULL:
-            raise BufferError, "NULL buffer view in getbuffer"
-
-        buffer.buf = <char*>self.c_comm_id.internal
-        buffer.obj = self
-        buffer.len = GA_COMM_ID_BYTES * sizeof(char)
-        buffer.readonly = 0
-        buffer.itemsize = sizeof(char)
-        if flags & PyBUF_FORMAT == PyBUF_FORMAT:
-            buffer.format = 'b'
-        else:
-            buffer.format = NULL
-        buffer.ndim = 1
-        if flags & PyBUF_ND == PyBUF_ND:
-            buffer.shape = <Py_ssize_t*>calloc(1, sizeof(Py_ssize_t))
-            buffer.shape[0] = GA_COMM_ID_BYTES
-        else:
-            buffer.shape = NULL
-        if flags & PyBUF_STRIDES == PyBUF_STRIDES:
-            buffer.strides = &buffer.itemsize
-        else:
-            buffer.strides = NULL
-        buffer.suboffsets = NULL
-        buffer.internal = NULL
-        Py_INCREF(self)
-
-    def __releasebuffer__(self, Py_buffer* buffer):
-        if buffer == NULL:
-            raise BufferError, "NULL buffer view in releasebuffer"
-
-        if buffer.shape != NULL:
-            free(buffer.shape)
-        Py_DECREF(self)
-
     def __richcmp__(this, that, int op):
         if type(this) != type(that):
             raise TypeError, "Cannot compare %s with %s" % (type(this), type(that))
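The two methods removed above implemented the PEP 3118 buffer protocol by hand: filling every Py_buffer field, calloc'ing and freeing the shape array, and balancing Py_INCREF/Py_DECREF on the owner. For comparison, here is a minimal C-API sketch of the same export; cliqueid_getbuffer and the GA_COMM_ID_BYTES value below are illustrative stand-ins, not libgpuarray code. PyBuffer_FillInfo does all of that bookkeeping for a simple 1-D byte buffer and needs no matching release handler.

#include <Python.h>

#define GA_COMM_ID_BYTES 128  /* stand-in; use the library's constant */

/* Expose `internal` as a writable 1-D byte buffer owned by `self`.
   PyBuffer_FillInfo fills buf/len/itemsize/format/shape/strides
   according to `flags` and INCREFs the owner; returns 0 on success. */
static int
cliqueid_getbuffer(PyObject *self, char *internal, Py_buffer *view, int flags)
{
    return PyBuffer_FillInfo(view, self, internal,
                             GA_COMM_ID_BYTES, /* readonly */ 0, flags);
}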

pygpu/tests/collectives/test_collectives.py renamed to pygpu/tests/test_collectives.py

Lines changed: 3 additions & 10 deletions

@@ -88,13 +88,6 @@ def test_richcmp(self):
         with self.assertRaises(TypeError):
             a = cid2 > "asdfasfa"
 
-    def test_as_buffer(self):
-        a = np.asarray(self.cid)
-        assert np.allclose(a, self.cid.comm_id)
-        a[:] = [ord(b'a')] * COMM_ID_BYTES
-        assert np.allclose(a, self.cid.comm_id)
-
-
 @unittest.skipUnless(MPI_IMPORTED, "Needs mpi4py module")
 @unittest.skipIf(get_user_gpu_rank() == -1, "Collective operations supported on CUDA devices only")
 class TestGpuComm(unittest.TestCase):

@@ -293,19 +286,19 @@ def test_all_gather(self):
 
         a = cpu.reshape((5, 2), order='F')
         exp = texp.reshape((5, 2 * self.size), order='F')
-        gpu = gpuarray.asarray(a, context=self.ctx)
+        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
         resgpu = self.gpucomm.all_gather(gpu, nd_up=0)
         check_all(resgpu, exp)
 
         a = cpu.reshape((5, 2), order='F')
         exp = texp.reshape((5, 2, self.size), order='F')
-        gpu = gpuarray.asarray(a, context=self.ctx)
+        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
         resgpu = self.gpucomm.all_gather(gpu, nd_up=1)
         check_all(resgpu, exp)
 
         a = cpu.reshape((5, 2), order='F')
         exp = texp.reshape((5, 2, 1, 1, self.size), order='F')
-        gpu = gpuarray.asarray(a, context=self.ctx)
+        gpu = gpuarray.asarray(a, context=self.ctx, order='F')
         resgpu = self.gpucomm.all_gather(gpu, nd_up=3)
         check_all(resgpu, exp)
 

src/cluda_cuda.h

Lines changed: 8 additions & 4 deletions

@@ -65,10 +65,14 @@ struct ga_half {
   ga_ushort data;
 };
 
-#define ga_half2float(p) __half2float((p).data)
-__device__ static inline ga_half ga_float2half(float f) {
+static __device__ inline float ga_half2float(ga_half h) {
+  float r;
+  asm("{ cvt.f32.f16 %0, %1; }\n" : "=f"(r) : "h"(h.data));
+  return r;
+}
+static __device__ inline ga_half ga_float2half(float f) {
   ga_half r;
-  r.data = __float2half_rn(f);
+  asm("{ cvt.rn.f16.f32 %0, %1; }\n" : "=h"(r.data) : "f"(f));
   return r;
 }
 
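Dropping the __half2float/__float2half_rn intrinsics in favour of inline PTX is what makes this header work with CUDA 9.0, whose cuda_fp16.h turns __half into an opaque C++ type that no longer accepts a raw 16-bit integer. Emitting cvt.f32.f16 and cvt.rn.f16.f32 directly keeps the conversions tied to libgpuarray's own bit-level ga_half struct on every toolkit version. A self-contained sketch of the same pattern (the kernel and main are illustrative, not part of the commit):

// Build: nvcc half_roundtrip.cu -o half_roundtrip
#include <cstdio>

typedef unsigned short ga_ushort;
struct ga_half { ga_ushort data; };

// PTX-based conversions, as in cluda_cuda.h, independent of cuda_fp16.h.
static __device__ inline float ga_half2float(ga_half h) {
  float r;
  asm("{ cvt.f32.f16 %0, %1; }\n" : "=f"(r) : "h"(h.data));
  return r;
}
static __device__ inline ga_half ga_float2half(float f) {
  ga_half r;
  asm("{ cvt.rn.f16.f32 %0, %1; }\n" : "=h"(r.data) : "f"(f));
  return r;
}

// Round-trip: f32 -> f16 (round-to-nearest-even) -> f32.
__global__ void roundtrip(float *out, float in) {
  *out = ga_half2float(ga_float2half(in));
}

int main(void) {
  float *d_out, h_out = 0.0f;
  cudaMalloc(&d_out, sizeof(float));
  roundtrip<<<1, 1>>>(d_out, 0.1f);
  cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
  printf("0.1f through float16: %.7f\n", h_out);  // ~0.0999756
  cudaFree(d_out);
  return 0;
}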
@@ -142,7 +146,7 @@ __device__ ga_half atom_add_eg(ga_half *addr, ga_half val) {
   do {
     assumed = old;
     tmp.data = __byte_perm(old, 0, ((ga_size)addr & 2) ? 0x4432 : 0x4410);
-    sum = __float2half_rn(__half2float(val.data) + __half2float(tmp.data));
+    sum = ga_float2half(ga_half2float(val) + ga_half2float(tmp)).data;
     new_ = __byte_perm(old, sum, ((ga_size)addr & 2) ? 0x5410 : 0x3254);
     old = atomicCAS(base, assumed, new_);
   } while (assumed != old);
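This hunk sits inside a CAS loop that emulates a 16-bit atomic add, since CUDA offers no atomicAdd on a lone 16-bit value across all architectures: the code CASes the aligned 32-bit word containing the half and uses __byte_perm to pull the 16-bit lane out and splice the updated lane back in. The fix above simply reroutes the float conversion through the new ga_half2float/ga_float2half helpers. The splicing pattern in isolation, on plain integers for clarity (a hedged sketch; the function name and the integer add are illustrative):

#include <cstddef>

// Add `val` to a 16-bit lane by CASing the enclosing aligned 32-bit word.
__device__ unsigned short atomic_add_u16(unsigned short *addr,
                                         unsigned short val) {
  unsigned int *base =
      (unsigned int *)((size_t)addr & ~(size_t)2);  // containing word
  unsigned int old = *base, assumed;
  unsigned short cur;
  do {
    assumed = old;
    // Extract the 16-bit lane `addr` points into (high or low half).
    cur = (unsigned short)__byte_perm(old, 0,
                                      ((size_t)addr & 2) ? 0x4432 : 0x4410);
    // Splice the updated lane back into the 32-bit word.
    unsigned int new_ = __byte_perm(old, (unsigned int)(cur + val),
                                    ((size_t)addr & 2) ? 0x5410 : 0x3254);
    old = atomicCAS(base, assumed, new_);
  } while (assumed != old);
  return cur;  // lane value just before our add landed
}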

src/cluda_cuda.h.c

Lines changed: 309 additions & 298 deletions (large diff not rendered)

src/gpuarray/ext_cuda.h

Lines changed: 1 addition & 0 deletions

@@ -2,6 +2,7 @@
 #define LIBGPU_EXT_CUDA
 
 #include <cuda.h>
+#include <cuda_fp16.h>
 
 #include <gpuarray/config.h>
 #include <gpuarray/buffer.h>

src/gpuarray_buffer_cuda.c

Lines changed: 5 additions & 8 deletions

@@ -2,6 +2,7 @@
 
 #include "private.h"
 #include "private_cuda.h"
+
 #include "loaders/libnvrtc.h"
 #include "loaders/libcublas.h"
 

@@ -1087,9 +1088,11 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) {
   size_t buflen;
   const char *heads[1] = {"cluda.h"};
   const char *hsrc[1];
-  const char *opts[4] = {
+  const char *opts[] = {
     "-arch", ""
+#ifdef DEBUG
     , "-G", "-lineinfo"
+#endif
   };
   nvrtcResult err;
 

@@ -1100,13 +1103,7 @@ static int call_compiler(cuda_context *ctx, strb *src, strb *ptx, strb *log) {
   if (err != NVRTC_SUCCESS)
     return error_nvrtc(ctx->err, "nvrtcCreateProgram", err);
 
-  err = nvrtcCompileProgram(prog,
-#ifdef DEBUG
-                            4,
-#else
-                            2,
-#endif
-                            opts);
+  err = nvrtcCompileProgram(prog, sizeof(opts)/sizeof(char *), opts);
 
   /* Get the log before handling the error */
   if (nvrtcGetProgramLogSize(prog, &buflen) == NVRTC_SUCCESS) {
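The rewrite above moves the DEBUG conditional from the argument count into the array initializer and derives the count with sizeof(opts)/sizeof(char *), so the two can no longer drift apart the way the hand-maintained 2-versus-4 literal could. The idiom in isolation (a stand-alone sketch, not library code):

#include <stdio.h>

/* The option count tracks the initializer: 2 normally, 4 with -DDEBUG. */
static const char *opts[] = {
  "-arch", ""
#ifdef DEBUG
  , "-G", "-lineinfo"
#endif
};

int main(void) {
  printf("%u options\n", (unsigned)(sizeof(opts) / sizeof(char *)));
  return 0;
}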

src/gpuarray_elemwise.c

Lines changed: 2 additions & 3 deletions

@@ -547,7 +547,6 @@ static int gen_elemwise_contig_kernel(GpuKernel *k,
 static int check_contig(GpuElemwise *ge, void **args,
                         size_t *_n, int *contig) {
   GpuArray *a = NULL, *v;
-  gpucontext *ctx = GpuKernel_context(&ge->k_contig);
   size_t n = 1;
   unsigned int i, j;
   int c_contig = 1, f_contig = 1;

@@ -563,10 +562,10 @@ static int check_contig(GpuElemwise *ge, void **args,
     f_contig &= GpuArray_IS_F_CONTIGUOUS(v);
     if (a != v) {
       if (a->nd != v->nd)
-        return error_fmt(ctx->err, GA_INVALID_ERROR, "Mismatched nd for input %u (expected %u, got %u)", i, a->nd, v->nd);
+        return -1; /* We don't check the value of the error code */
       for (j = 0; j < a->nd; j++) {
         if (v->dimensions[j] != a->dimensions[j])
-          return error_fmt(ctx->err, GA_VALUE_ERROR, "Mismatched dimension %u (expected %" SPREFIX "u, got %" SPREFIX "u)", j, a->dimensions[j], v->dimensions[j]);
+          return -1; /* We don't check the value of the error code */
       }
     }
   }

tests/check_collectives.c

Lines changed: 0 additions & 1 deletion

@@ -49,7 +49,6 @@ extern void teardown_comm(void);
   int(*EXP)[(outcols)]; \
   size_t indims[ND]; \
   size_t outdims[ND]; \
-  const ssize_t instrds[ND] = {sizeof(*A), sizeof(int)}; \
   const ssize_t outstrds[ND] = {sizeof(*RES), sizeof(int)}; \
   int err; \
   size_t i, j, outsize; \
