Switch the order of gs and ls to conform to what the underlying APIs use.

abergeron · abergeron · commit 6d449061c3ce · 2017-01-12T13:32:32.000-05:00
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
@@ -88,7 +88,7 @@ set_target_properties(gpuarray PROPERTIES
   INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib
   MACOSX_RPATH OFF
   # This is the shared library version
-  VERSION 1.0
+  VERSION 2.0
   )
 
 add_library(gpuarray-static STATIC ${GPUARRAY_SRC})
diff --git a/src/gpuarray/buffer.h b/src/gpuarray/buffer.h
@@ -482,15 +482,15 @@ GPUARRAY_PUBLIC int gpukernel_setarg(gpukernel *k, unsigned int i, void *a);
  *
  * \param k kernel
  * \param n number of dimensions of grid/block
- * \param bs block sizes for this call (also known as local size)
  * \param gs grid sizes for this call (also known as global size)
+ * \param ls block sizes for this call (also known as local size)
  * \param shared amount of dynamic shared memory to reserve
  * \param args table of pointers to each argument (optional).
  *
  * \returns GA_NO_ERROR or an error code if an error occurred.
  */
 GPUARRAY_PUBLIC int gpukernel_call(gpukernel *k, unsigned int n,
-                                   const size_t *ls, const size_t *gs,
+                                   const size_t *gs, const size_t *ls,
                                    size_t shared, void **args);
 
 /**
diff --git a/src/gpuarray/config.h b/src/gpuarray/config.h
@@ -3,7 +3,7 @@
 
 /* The following included file should have been generated by CMake. */
 #include <gpuarray/abi_version.h>
-#define GPUARRAY_API_VERSION 0
+#define GPUARRAY_API_VERSION 1
 
 #ifdef GPUARRAY_SHARED
  #ifdef _WIN32
diff --git a/src/gpuarray/kernel.h b/src/gpuarray/kernel.h
@@ -87,24 +87,24 @@ GPUARRAY_PUBLIC int GpuKernel_setarg(GpuKernel *k, unsigned int i, void *val);
  *
  * \param k the kernel to schedule for
  * \param n number of elements to handle
- * \param ls local size (in/out)
  * \param gs grid size (in/out)
+ * \param ls local size (in/out)
  */
 GPUARRAY_PUBLIC int GpuKernel_sched(GpuKernel *k, size_t n,
-                                    size_t *ls, size_t *gs);
+                                    size_t *gs, size_t *ls);
 
 /**
  * Launch the execution of a kernel.
  *
  * \param k the kernel to launch
  * \param n dimensionality of the grid/blocks
- * \param ls sizes of launch blocks
  * \param gs sizes of launch grid
+ * \param ls sizes of launch blocks
  * \param amount of dynamic shared memory to allocate
  * \param args table of pointers to arguments
  */
 GPUARRAY_PUBLIC int GpuKernel_call(GpuKernel *k, unsigned int n,
-                                   const size_t *ls, const size_t *gs,
+                                   const size_t *gs, const size_t *ls,
                                    size_t shared, void **args);
 
 GPUARRAY_PUBLIC int GpuKernel_binary(const GpuKernel *k, size_t *sz,
diff --git a/src/gpuarray_array.c b/src/gpuarray_array.c
@@ -487,7 +487,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
   if (err != GA_NO_ERROR)
     return err;
 
-  err = GpuKernel_sched(&k, n[0]*n[1], &ls[1], &gs[1]);
+  err = GpuKernel_sched(&k, n[0]*n[1], &gs[1], &ls[1]);
   if (err != GA_NO_ERROR)
     goto out;
 
@@ -521,7 +521,7 @@ int GpuArray_take1(GpuArray *a, const GpuArray *v, const GpuArray *i,
   GpuKernel_setarg(&k, argp++, &n[1]);
   GpuKernel_setarg(&k, argp++, errbuf);
 
-  err = GpuKernel_call(&k, 2, ls, gs, 0, NULL);
+  err = GpuKernel_call(&k, 2, gs, ls, 0, NULL);
   if (check_error && err == GA_NO_ERROR) {
     err = gpudata_read(&kerr, errbuf, 0, sizeof(int));
     if (err == GA_NO_ERROR && kerr != 0) {
diff --git a/src/gpuarray_blas_cuda_cublas.c b/src/gpuarray_blas_cuda_cublas.c
@@ -1099,9 +1099,9 @@ static int sgemvBatch(cb_order order, cb_transpose transA,
   args[8] = &N;
 
   if (transA == cb_no_trans) {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_N_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_N_a1_b1_small, 2, gs, ls, 0, args);
   } else {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_T_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgemvBH_T_a1_b1_small, 2, gs, ls, 0, args);
   }
 
   cuda_ops.buffer_release(Aa);
@@ -1223,9 +1223,9 @@ static int dgemvBatch(cb_order order, cb_transpose transA,
   args[8] = &N;
 
   if (transA == cb_no_trans) {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_N_a1_b1_small, 2, gs, ls, 0, args);
   } else {
-    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, ls, gs, 0, args);
+    err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->dgemvBH_T_a1_b1_small, 2, gs, ls, 0, args);
   }
 
   cuda_ops.buffer_release(Aa);
@@ -1486,7 +1486,7 @@ static int sgerBatch(cb_order order, size_t M, size_t N, float alpha,
   args[8] = &M;
   args[9] = &N;
 
-  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, ls, gs, 0, args);
+  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, gs, ls, 0, args);
 
   cuda_ops.buffer_release(Aa);
   cuda_ops.buffer_release(xa);
@@ -1618,7 +1618,7 @@ static int dgerBatch(cb_order order, size_t M, size_t N, double alpha,
   args[8] = &M;
   args[9] = &N;
 
-  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, ls, gs, 0, args);
+  err = GpuKernel_call(&((blas_handle *)ctx->blas_handle)->sgerBH_gen_small, 3, gs, ls, 0, args);
 
   cuda_ops.buffer_release(Aa);
   cuda_ops.buffer_release(xa);
diff --git a/src/gpuarray_buffer.c b/src/gpuarray_buffer.c
@@ -180,9 +180,9 @@ int gpukernel_setarg(gpukernel *k, unsigned int i, void *a) {
   return ((partial_gpukernel *)k)->ctx->ops->kernel_setarg(k, i, a);
 }
 
-int gpukernel_call(gpukernel *k, unsigned int n, const size_t *ls,
-                   const size_t *gs, size_t shared, void **args) {
-  return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, ls, gs,
+int gpukernel_call(gpukernel *k, unsigned int n, const size_t *gs,
+                   const size_t *ls, size_t shared, void **args) {
+  return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, gs, ls,
                                                          shared, args);
 }
 
diff --git a/src/gpuarray_buffer_cuda.c b/src/gpuarray_buffer_cuda.c
@@ -1237,7 +1237,7 @@ static int cuda_kernelsetarg(gpukernel *k, unsigned int i, void *arg) {
 }
 
 static int cuda_callkernel(gpukernel *k, unsigned int n,
-                           const size_t *bs, const size_t *gs,
+                           const size_t *gs, const size_t *ls,
                            size_t shared, void **args) {
     cuda_context *ctx = k->ctx;
     unsigned int i;
@@ -1258,15 +1258,15 @@ static int cuda_callkernel(gpukernel *k, unsigned int n,
 
     switch (n) {
     case 1:
-      ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, bs[0], 1, 1, shared,
+      ctx->err = cuLaunchKernel(k->k, gs[0], 1, 1, ls[0], 1, 1, shared,
                                 ctx->s, args, NULL);
       break;
     case 2:
-      ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, bs[0], bs[1], 1, shared,
+      ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], 1, ls[0], ls[1], 1, shared,
                                 ctx->s, args, NULL);
       break;
     case 3:
-      ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], bs[0], bs[1], bs[2],
+      ctx->err = cuLaunchKernel(k->k, gs[0], gs[1], gs[2], ls[0], ls[1], ls[2],
                                 shared, ctx->s, args, NULL);
       break;
     default:
diff --git a/src/gpuarray_buffer_opencl.c b/src/gpuarray_buffer_opencl.c
@@ -274,7 +274,7 @@ cl_mem cl_get_buf(gpudata *g) { ASSERT_BUF(g); return g->buf; }
 
 static void cl_releasekernel(gpukernel *k);
 static int cl_callkernel(gpukernel *k, unsigned int n,
-                         const size_t *bs, const size_t *gs,
+                         const size_t *gs, const size_t *ls,
                          size_t shared, void **args);
 
 static const char CL_PREAMBLE[] =
@@ -748,7 +748,7 @@ static int cl_memset(gpudata *dst, size_t offset, int data) {
   if (res != GA_NO_ERROR) goto fail;
   gs = ((n-1) / ls) + 1;
   args[0] = dst;
-  res = cl_callkernel(m, 1, &ls, &gs, 0, args);
+  res = cl_callkernel(m, 1, &gs, &ls, 0, args);
 
  fail:
   cl_releasekernel(m);
@@ -998,7 +998,7 @@ static int cl_setkernelarg(gpukernel *k, unsigned int i, void *a) {
 }
 
 static int cl_callkernel(gpukernel *k, unsigned int n,
-                         const size_t *ls, const size_t *gs,
+                         const size_t *gs, const size_t *ls,
                          size_t shared, void **args) {
   cl_ctx *ctx = k->ctx;
   size_t _gs[3];
diff --git a/src/gpuarray_elemwise.c b/src/gpuarray_elemwise.c
@@ -414,10 +414,10 @@ static int call_basic(GpuElemwise *ge, void **args, size_t n, unsigned int nd,
     }
   }
 
-  err = GpuKernel_sched(k, n, &ls, &gs);
+  err = GpuKernel_sched(k, n, &gs, &ls);
   if (err != GA_NO_ERROR) goto error;
 
-  err = GpuKernel_call(k, 1, &ls, &gs, 0, NULL);
+  err = GpuKernel_call(k, 1, &gs, &ls, 0, NULL);
  error:
   return err;
 }
@@ -572,9 +572,9 @@ static int call_contig(GpuElemwise *ge, void **args, size_t n) {
       if (err != GA_NO_ERROR) return err;
     }
   }
-  err = GpuKernel_sched(&ge->k_contig, n, &ls, &gs);
+  err = GpuKernel_sched(&ge->k_contig, n, &gs, &ls);
   if (err != GA_NO_ERROR) return err;
-  return GpuKernel_call(&ge->k_contig, 1, &ls, &gs, 0, NULL);
+  return GpuKernel_call(&ge->k_contig, 1, &gs, &ls, 0, NULL);
 }
 
 GpuElemwise *GpuElemwise_new(gpucontext *ctx,
diff --git a/src/gpuarray_kernel.c b/src/gpuarray_kernel.c
@@ -32,7 +32,7 @@ gpucontext *GpuKernel_context(GpuKernel *k) {
   return gpukernel_context(k->k);
 }
 
-int GpuKernel_sched(GpuKernel *k, size_t n, size_t *ls, size_t *gs) {
+int GpuKernel_sched(GpuKernel *k, size_t n, size_t *gs, size_t *ls) {
   size_t min_l;
   size_t max_l;
   size_t target_l;
@@ -90,9 +90,9 @@ int GpuKernel_setarg(GpuKernel *k, unsigned int i, void *a) {
 }
 
 int GpuKernel_call(GpuKernel *k, unsigned int n,
-                   const size_t *bs, const size_t *gs,
+                   const size_t *gs, const size_t *ls,
                    size_t shared, void **args) {
-  return gpukernel_call(k->k, n, bs, gs, shared, args);
+  return gpukernel_call(k->k, n, gs, ls, shared, args);
 }
 
 int GpuKernel_binary(const GpuKernel *k, size_t *sz, void **bin) {
diff --git a/src/gpuarray_reduction.c b/src/gpuarray_reduction.c
@@ -815,8 +815,8 @@ static int   maxandargmaxInvoke                 (maxandargmax_ctx*  ctx){
 	   ctx->dstArgmaxStepsGD){
 		ctx->ret = GpuKernel_call(&ctx->kernel,
 		                          ctx->ndh>0 ? ctx->ndh : 1,
-		                          ctx->blockSize,
 		                          ctx->gridSize,
+		                          ctx->blockSize,
 		                          0,
 		                          args);
 	}else{
diff --git a/src/private.h b/src/private.h
@@ -97,7 +97,7 @@ struct _gpuarray_buffer_ops {
   void (*kernel_release)(gpukernel *k);
   int (*kernel_setarg)(gpukernel *k, unsigned int i, void *a);
   int (*kernel_call)(gpukernel *k, unsigned int n,
-                     const size_t *bs, const size_t *gs,
+                     const size_t *gs, const size_t *ls,
                      size_t shared, void **args);
 
   int (*kernel_binary)(gpukernel *k, size_t *sz, void **obj);

Original file line number	Diff line number	Diff line change
`@@ -88,7 +88,7 @@ set_target_properties(gpuarray PROPERTIES`
`88`	`88`	`INSTALL_NAME_DIR ${CMAKE_INSTALL_PREFIX}/lib`
`89`	`89`	`MACOSX_RPATH OFF`
`90`	`90`	`# This is the shared library version`
`91`		`- VERSION 1.0`
	`91`	`+ VERSION 2.0`
`92`	`92`	`)`
`93`	`93`
`94`	`94`	`add_library(gpuarray-static STATIC ${GPUARRAY_SRC})`
Original file line number	Diff line number	Diff line change
`@@ -180,9 +180,9 @@ int gpukernel_setarg(gpukernel *k, unsigned int i, void *a) {`
`180`	`180`	`return ((partial_gpukernel *)k)->ctx->ops->kernel_setarg(k, i, a);`
`181`	`181`	`}`
`182`	`182`
`183`		`-int gpukernel_call(gpukernel *k, unsigned int n, const size_t *ls,`
`184`		`- const size_t *gs, size_t shared, void **args) {`
`185`		`- return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, ls, gs,`
	`183`	`+int gpukernel_call(gpukernel *k, unsigned int n, const size_t *gs,`
	`184`	`+ const size_t *ls, size_t shared, void **args) {`
	`185`	`+ return ((partial_gpukernel *)k)->ctx->ops->kernel_call(k, n, gs, ls,`
`186`	`186`	`shared, args);`
`187`	`187`	`}`
`188`	`188`
Original file line number	Diff line number	Diff line change
`@@ -414,10 +414,10 @@ static int call_basic(GpuElemwise *ge, void **args, size_t n, unsigned int nd,`
`414`	`414`	`}`
`415`	`415`	`}`
`416`	`416`
`417`		`- err = GpuKernel_sched(k, n, &ls, &gs);`
	`417`	`+ err = GpuKernel_sched(k, n, &gs, &ls);`
`418`	`418`	`if (err != GA_NO_ERROR) goto error;`
`419`	`419`
`420`		`- err = GpuKernel_call(k, 1, &ls, &gs, 0, NULL);`
	`420`	`+ err = GpuKernel_call(k, 1, &gs, &ls, 0, NULL);`
`421`	`421`	`error:`
`422`	`422`	`return err;`
`423`	`423`	`}`
`@@ -572,9 +572,9 @@ static int call_contig(GpuElemwise *ge, void **args, size_t n) {`
`572`	`572`	`if (err != GA_NO_ERROR) return err;`
`573`	`573`	`}`
`574`	`574`	`}`
`575`		`- err = GpuKernel_sched(&ge->k_contig, n, &ls, &gs);`
	`575`	`+ err = GpuKernel_sched(&ge->k_contig, n, &gs, &ls);`
`576`	`576`	`if (err != GA_NO_ERROR) return err;`
`577`		`- return GpuKernel_call(&ge->k_contig, 1, &ls, &gs, 0, NULL);`
	`577`	`+ return GpuKernel_call(&ge->k_contig, 1, &gs, &ls, 0, NULL);`
`578`	`578`	`}`
`579`	`579`
`580`	`580`	`GpuElemwise *GpuElemwise_new(gpucontext *ctx,`