
Commit 359dcbe

update brainpylib

1 parent 10d25bb commit 359dcbe

File tree

8 files changed: +120 -71 lines changed

brainpy/dyn/base.py

Lines changed: 2 additions & 2 deletions
@@ -756,8 +756,8 @@ def __init__(

   def __repr__(self):
     names = self.__class__.__name__
-    return (f'{names}(name={self.name}, mode={self.mode}, '
-            f'{" " * len(names)} pre={self.pre}, '
+    return (f'{names}(name={self.name}, mode={self.mode}, \n'
+            f'{" " * len(names)} pre={self.pre}, \n'
             f'{" " * len(names)} post={self.post})')

   def check_pre_attrs(self, *attrs):
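
The change makes a connection's repr print one field per line, with continuation lines padded to align under the class name. A minimal standalone sketch of the pattern (the `Demo` class and its attribute values are ours, purely for illustration):

class Demo:
    def __init__(self, name, mode, pre, post):
        self.name, self.mode, self.pre, self.post = name, mode, pre, post

    def __repr__(self):
        # same multi-line pattern as the diff above
        names = self.__class__.__name__
        return (f'{names}(name={self.name}, mode={self.mode}, \n'
                f'{" " * len(names)} pre={self.pre}, \n'
                f'{" " * len(names)} post={self.post})')

print(Demo('syn0', 'train', 'LIF-A', 'LIF-B'))
# Demo(name=syn0, mode=train,
#      pre=LIF-A,
#      post=LIF-B)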

brainpy/dyn/layers/dropout.py

Lines changed: 0 additions & 4 deletions
@@ -15,10 +15,6 @@ class Dropout(DynamicalSystem):

   In training, to compensate for the fraction of input values dropped (`rate`),
   all surviving values are multiplied by `1 / (1 - rate)`.

-  The parameter `shared_axes` allows to specify a list of axes on which
-  the mask will be shared: we will use size 1 on those axes for dropout mask
-  and broadcast it. Sharing reduces randomness, but can save memory.
-
   This layer is active only during training (`mode='train'`). In other
   circumstances it is a no-op.
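
The docstring's compensation rule is easy to verify in a standalone sketch (plain NumPy, independent of the BrainPy API; seed and sizes are arbitrary):

import numpy as np

rate = 0.5                                # fraction of inputs dropped
rng = np.random.default_rng(0)

x = np.ones(8, dtype=np.float32)
keep = rng.random(x.shape) >= rate        # mask of surviving entries
y = np.where(keep, x / (1 - rate), 0.0)   # survivors scaled by 1 / (1 - rate)

print(y.mean())  # close to x.mean() == 1.0: the expected value is preserved

In evaluation mode the layer returns its input unchanged, so no rescaling is needed at test time.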

extensions/brainpylib/atomic_sum.py

Lines changed: 2 additions & 1 deletion
@@ -115,7 +115,8 @@ def _atomic_sum_translation(c, values, pre_ids, post_ids, *, post_num, platform=
       shape_with_layout=x_shape(np.dtype(values_dtype), (post_num,), (0,)),
     )
   elif platform == 'gpu':
-    if gpu_ops is None: raise ValueError('Cannot find compiled gpu wheels.')
+    if gpu_ops is None:
+      raise ValueError('Cannot find compiled gpu wheels.')

     opaque = gpu_ops.build_atomic_sum_descriptor(conn_size, post_num)
     if values_dim[0] != 1:

extensions/brainpylib/event_sum.py

Lines changed: 30 additions & 30 deletions
@@ -7,14 +7,17 @@

 from functools import partial

+from typing import Union, Tuple
 import jax.numpy as jnp
 import numpy as np
-from jax import core
+from jax import core, dtypes
 from jax.abstract_arrays import ShapedArray
 from jax.interpreters import xla, batching
 from jax.lax import scan
 from jax.lib import xla_client

+from .utils import GPUOperatorNotFound
+
 try:
   from . import gpu_ops
 except ImportError:
@@ -26,7 +29,10 @@

 _event_sum_prim = core.Primitive("event_sum")


-def event_sum(events, pre2post, post_num, values):
+def event_sum(events: jnp.ndarray,
+              pre2post: Tuple[jnp.ndarray, jnp.ndarray],
+              post_num: int,
+              values: Union[float, jnp.ndarray]):
   # events
   if events.dtype != jnp.bool_:
     raise ValueError(f'"events" must be a vector of bool, while we got {events.dtype}')
@@ -39,17 +45,16 @@ def event_sum(events, pre2post, post_num, values):
   if indices.dtype != indptr.dtype:
     raise ValueError(f"The dtype of pre2post[0] must be equal to that of pre2post[1], "
                      f"while we got {(indices.dtype, indptr.dtype)}")
-  if indices.dtype not in [jnp.uint32, jnp.uint64]:
-    raise ValueError(f'The dtype of pre2post must be uint32 or uint64, while we got {indices.dtype}')
+  if indices.dtype not in [jnp.uint32, jnp.uint64, jnp.int32, jnp.int64]:
+    raise ValueError(f'The dtype of pre2post must be integer, while we got {indices.dtype}')

   # output value
-  values = jnp.asarray([values])
-  if values.dtype not in [jnp.float32, jnp.float64]:
-    raise ValueError(f'The dtype of "values" must be float32 or float64, while we got {values.dtype}.')
-  if values.size not in [1, indices.size]:
+  dtype = values.dtype if isinstance(values, jnp.ndarray) else dtypes.canonicalize_dtype(type(values))
+  if dtype not in [jnp.float32, jnp.float64]:
+    raise ValueError(f'The dtype of "values" must be float32 or float64, while we got {dtype}.')
+  if np.size(values) not in [1, indices.size]:
     raise ValueError(f'The size of "values" must be 1 (a scalar) or len(pre2post[0]) (a vector), '
-                     f'while we got {values.size} != 1 != {indices.size}')
-  values = values.flatten()
+                     f'while we got {np.size(values)} != 1 != {indices.size}')
   # bind operator
   return _event_sum_prim.bind(events, indices, indptr, values, post_num=post_num)

@@ -58,34 +63,27 @@ def _event_sum_abstract(events, indices, indptr, values, *, post_num):
   return ShapedArray(dtype=values.dtype, shape=(post_num,))


-_event_sum_prim.def_abstract_eval(_event_sum_abstract)
-_event_sum_prim.def_impl(partial(xla.apply_primitive, _event_sum_prim))
-
-
 def _event_sum_translation(c, events, indices, indptr, values, *, post_num, platform="cpu"):
-  # The pre/post shape
+  # The shape of pre/post
   pre_size = np.array(c.get_shape(events).dimensions()[0], dtype=np.uint32)
   _pre_shape = x_shape(np.dtype(np.uint32), (), ())
   _post_shape = x_shape(np.dtype(np.uint32), (), ())

   # The indices shape
   indices_shape = c.get_shape(indices)
   Itype = indices_shape.element_type()
-  assert Itype in [np.uint32, np.uint64]

   # The value shape
   values_shape = c.get_shape(values)
   Ftype = values_shape.element_type()
-  assert Ftype in [np.float32, np.float64]
   values_dim = values_shape.dimensions()

   # We dispatch a different call depending on the dtype
   f_type = b'_f32' if Ftype == np.float32 else b'_f64'
-  i_type = b'_i32' if Itype == np.uint32 else b'_i64'
+  i_type = b'_i32' if Itype in [np.uint32, np.int32] else b'_i64'

-  # And then the following is what changes between the GPU and CPU
   if platform == "cpu":
-    v_type = b'_event_sum_homo' if values_dim[0] == 1 else b'_event_sum_heter'
+    v_type = b'_event_sum_homo' if len(values_dim) == 0 else b'_event_sum_heter'
     return x_ops.CustomCallWithLayout(
       c,
       platform.encode() + v_type + f_type + i_type,
@@ -103,9 +101,12 @@ def _event_sum_translation(c, events, indices, indptr, values, *, post_num, platform="cpu"):
         c.get_shape(values)),
       shape_with_layout=x_shape(np.dtype(Ftype), (post_num,), (0,)),
     )
+
+  # GPU platform
   elif platform == 'gpu':
     if gpu_ops is None:
-      raise ValueError('Cannot find compiled gpu wheels.')
+      raise GPUOperatorNotFound('event_sum')
+
     v_type = b'_event_sum_homo' if values_dim[0] == 1 else b'_event_sum_heter'
     opaque = gpu_ops.build_event_sum_descriptor(pre_size, post_num)
     return x_ops.CustomCallWithLayout(
@@ -127,11 +128,7 @@ def _event_sum_translation(c, events, indices, indptr, values, *, post_num, platform="cpu"):
   raise ValueError("Unsupported platform, we only support 'cpu' or 'gpu'")


-xla.backend_specific_translations["cpu"][_event_sum_prim] = partial(_event_sum_translation, platform="cpu")
-xla.backend_specific_translations["gpu"][_event_sum_prim] = partial(_event_sum_translation, platform="gpu")
-
-
-def _event_sum_batch(args, axes):
+def _event_sum_batch(args, axes, *, post_num):
   batch_axes, batch_args, non_batch_args = [], {}, {}
   for ax_i, ax in enumerate(axes):
     if ax is None:
@@ -143,19 +140,22 @@ def _event_sum_batch(args, axes):
   def f(_, x):
     pars = tuple([(x[f'ax{i}'] if i in batch_axes else non_batch_args[f'ax{i}'])
                   for i in range(len(axes))])
-    return 0, _event_sum_prim.bind(*pars)
+    return 0, _event_sum_prim.bind(*pars, post_num=post_num)
+
   _, outs = scan(f, 0, batch_args)
   return outs, 0


+_event_sum_prim.def_abstract_eval(_event_sum_abstract)
+_event_sum_prim.def_impl(partial(xla.apply_primitive, _event_sum_prim))
 batching.primitive_batchers[_event_sum_prim] = _event_sum_batch
-
+xla.backend_specific_translations["cpu"][_event_sum_prim] = partial(_event_sum_translation, platform="cpu")
+xla.backend_specific_translations["gpu"][_event_sum_prim] = partial(_event_sum_translation, platform="gpu")

 # ---------------------------
 # event sum kernel 2
 # ---------------------------

-
 _event_sum2_prim = core.Primitive("event_sum2")
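
Putting the new signature together, a hedged usage sketch (toy sizes; we assume the package re-exports `event_sum` and a CSR connection of 3 presynaptic to 4 postsynaptic neurons):

import jax.numpy as jnp
from brainpylib import event_sum  # assumes a built brainpylib wheel

# CSR connectivity: indices[j] is the postsynaptic target of connection j,
# and indptr[i]:indptr[i+1] spans the connections of presynaptic neuron i.
indices = jnp.asarray([0, 1, 2, 1, 3], dtype=jnp.uint32)
indptr = jnp.asarray([0, 2, 3, 5], dtype=jnp.uint32)

events = jnp.asarray([True, False, True])           # presynaptic spikes
out = event_sum(events, (indices, indptr), 4, 1.0)  # homogeneous weight
# out == [1., 2., 0., 1.]: neuron 0 hits targets {0, 1}; neuron 2 hits {1, 3}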

extensions/brainpylib/utils.py

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
+# -*- coding: utf-8 -*-
+
+
+__all__ = [
+  'GPUOperatorNotFound',
+]
+
+
+class GPUOperatorNotFound(Exception):
+  def __init__(self, name):
+    super(GPUOperatorNotFound, self).__init__(f'''
+GPU operator for "{name}" was not found.
+
+Please compile the brainpylib GPU operators with the guidance in the following link:
+
+https://brainpy.readthedocs.io/en/latest/tutorial_advanced/compile_brainpylib.html
+''')
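
A quick sketch of the error path this enables (the operator name here is illustrative):

from brainpylib.utils import GPUOperatorNotFound

try:
    raise GPUOperatorNotFound('event_sum')  # raised when gpu_ops is None
except GPUOperatorNotFound as err:
    print(err)  # message points users to the compilation guide above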

extensions/lib/event_sum_cpu.cc

Lines changed: 49 additions & 9 deletions
@@ -4,44 +4,82 @@ namespace brainpy_lib {
 namespace{
 template <typename F, typename I>
 void cpu_event_sum_homo(void *out, const void **in) {
-  // Parse the inputs
   const std::uint32_t pre_size = *reinterpret_cast<const std::uint32_t *>(in[0]);
   const std::uint32_t post_size = *reinterpret_cast<const std::uint32_t *>(in[1]);
   const bool *events = reinterpret_cast<const bool *>(in[2]);
   const I *indices = reinterpret_cast<const I *>(in[3]);
   const I *indptr = reinterpret_cast<const I *>(in[4]);
-  const F *values = reinterpret_cast<const F *>(in[5]);
-  const F value = values[0];
+  const F weight = *reinterpret_cast<const F *>(in[5]);
+  F *result = reinterpret_cast<F *>(out);

-  // The output
+  // algorithm
+  memset(&result[0], 0, sizeof(F) * post_size);
+  for (std::uint32_t i=0; i<pre_size; ++i) {
+    if (events[i]){
+      for (I j=indptr[i]; j<indptr[i+1]; ++j) {
+        result[indices[j]] += weight;
+      }
+    }
+  }
+}
+
+// TODO:: batch version of "event_sum_homo" CPU operator
+template <typename F, typename I>
+void cpu_event_sum_batch_homo(void *out, const void **in) {
+  const std::uint32_t pre_size = *reinterpret_cast<const std::uint32_t *>(in[0]);
+  const std::uint32_t post_size = *reinterpret_cast<const std::uint32_t *>(in[1]);
+  const bool *events = reinterpret_cast<const bool *>(in[2]);
+  const I *indices = reinterpret_cast<const I *>(in[3]);
+  const I *indptr = reinterpret_cast<const I *>(in[4]);
+  const F weight = *reinterpret_cast<const F *>(in[5]);
   F *result = reinterpret_cast<F *>(out);

   // algorithm
-  memset(&result[0], 0, sizeof(result[0]) * post_size);
+  memset(&result[0], 0, sizeof(F) * post_size);
   for (std::uint32_t i=0; i<pre_size; ++i) {
     if (events[i]){
       for (I j=indptr[i]; j<indptr[i+1]; ++j) {
-        result[indices[j]] += value;
+        result[indices[j]] += weight;
       }
     }
   }
 }

 template <typename F, typename I>
 void cpu_event_sum_heter(void *out, const void **in) {
-  // Parse the inputs
   const std::uint32_t pre_size = *reinterpret_cast<const std::uint32_t *>(in[0]);
   const std::uint32_t post_size = *reinterpret_cast<const std::uint32_t *>(in[1]);
   const bool *events = reinterpret_cast<const bool *>(in[2]);
   const I *indices = reinterpret_cast<const I *>(in[3]);
   const I *indptr = reinterpret_cast<const I *>(in[4]);
   const F *values = reinterpret_cast<const F *>(in[5]);
+  F *result = reinterpret_cast<F *>(out);
+
+  // algorithm
+  memset(&result[0], 0, sizeof(F) * post_size);
+  for (std::uint32_t i = 0; i < pre_size; ++i) {
+    if (events[i]){
+      for (I j = indptr[i]; j < indptr[i+1]; ++j) {
+        result[indices[j]] += values[j];
+      }
+    }
+  }
+}

-  // The output
+// TODO:: batch version of "event_sum_heter" CPU operator
+template <typename F, typename I>
+void cpu_event_sum_batch_heter(void *out, const void **in) {
+  const std::uint32_t pre_size = *reinterpret_cast<const std::uint32_t *>(in[0]);
+  const std::uint32_t post_size = *reinterpret_cast<const std::uint32_t *>(in[1]);
+  const bool *events = reinterpret_cast<const bool *>(in[2]);
+  const I *indices = reinterpret_cast<const I *>(in[3]);
+  const I *indptr = reinterpret_cast<const I *>(in[4]);
+  const F *values = reinterpret_cast<const F *>(in[5]);
   F *result = reinterpret_cast<F *>(out);

   // algorithm
-  memset(&result[0], 0, sizeof(result[0]) * post_size);
+  memset(&result[0], 0, sizeof(F) * post_size);
   for (std::uint32_t i = 0; i < pre_size; ++i) {
     if (events[i]){
       for (I j = indptr[i]; j < indptr[i+1]; ++j) {

@@ -50,6 +88,8 @@ namespace{
       }
     }
   }
+
+
 }

 void cpu_event_sum_homo_f32_i32(void *out, const void **in){cpu_event_sum_homo<float, std::uint32_t>(out, in);}
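
The CPU kernels above all share one event-driven CSR traversal: zero the output, then for each presynaptic neuron that spiked, scatter its weight(s) into the postsynaptic targets. A NumPy reference of that algorithm (the function name and dtype choice are ours, for illustration only):

import numpy as np

def event_sum_reference(events, indices, indptr, values, post_num):
    # events:  (pre_num,) bool -- which presynaptic neurons spiked
    # indices: (n_conn,) CSR column indices (postsynaptic targets)
    # indptr:  (pre_num + 1,) CSR row pointers
    # values:  scalar (homogeneous) or (n_conn,) array (heterogeneous)
    out = np.zeros(post_num, dtype=np.float64)
    homo = np.ndim(values) == 0
    for i in np.flatnonzero(events):            # visit only spiking rows
        for j in range(indptr[i], indptr[i + 1]):
            out[indices[j]] += values if homo else values[j]
    return out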

extensions/lib/event_sum_gpu.cu

Lines changed: 11 additions & 24 deletions
@@ -458,8 +458,7 @@ namespace brainpy_lib {
         if (threadIdx.x < num_event) {
           const unsigned int pre_i = (r * 32) + threadIdx.x;
           shared_events[threadIdx.x] = events[pre_i];
-          if (shared_events[threadIdx.x])
-          {
+          if (shared_events[threadIdx.x]) {
             shPreStartID[threadIdx.x] = indptr[pre_i];
             shRowLength[threadIdx.x] = indptr[pre_i + 1] - shPreStartID[threadIdx.x];
           }

@@ -532,8 +531,7 @@ namespace brainpy_lib {
         if (threadIdx.x < num_event) {
           const unsigned int pre_i = (r * 32) + threadIdx.x;
           shared_events[threadIdx.x] = events[pre_i];
-          if (shared_events[threadIdx.x])
-          {
+          if (shared_events[threadIdx.x]) {
             shPreStartID[threadIdx.x] = indptr[pre_i];
             shRowLength[threadIdx.x] = indptr[pre_i + 1] - shPreStartID[threadIdx.x];
           }

@@ -553,7 +551,6 @@ namespace brainpy_lib {
     }


-
     template<typename F, typename I>
     inline void gpu_event_sum4_heter(cudaStream_t stream,
                                      void **buffers,

@@ -578,17 +575,16 @@ namespace brainpy_lib {
       cudaMemset(result, 0, sizeof(F) * post_size);
      event_sum4_heter_kernel<F, I><<<grid_dim, block_dim,
          /*dynamic_shared_mem_bytes=*/0, stream>>>(max_post_conn,
-                                                   pre_size,
-                                                   events,
-                                                   indices,
-                                                   indptr,
-                                                   values,
-                                                   result);
+          pre_size,
+          events,
+          indices,
+          indptr,
+          values,
+          result);
       ThrowIfError(cudaGetLastError());
     }


-
   } // namespace

@@ -758,24 +754,15 @@ namespace brainpy_lib {
   }

   // heterogeneous event sum 3
-  void gpu_event_sum3_heter_f32_i32(cudaStream_t stream,
-                                    void **buffers,
-                                    const char *opaque,
-                                    std::size_t opaque_len) {
+  void gpu_event_sum3_heter_f32_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len) {
     gpu_event_sum3_heter<float, std::uint32_t>(stream, buffers, opaque, opaque_len);
   }

-  void gpu_event_sum3_heter_f32_i64(cudaStream_t stream,
-                                    void **buffers,
-                                    const char *opaque,
-                                    std::size_t opaque_len) {
+  void gpu_event_sum3_heter_f32_i64(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len) {
     gpu_event_sum3_heter<float, std::uint64_t>(stream, buffers, opaque, opaque_len);
   }

-  void gpu_event_sum3_heter_f64_i32(cudaStream_t stream,
-                                    void **buffers,
-                                    const char *opaque,
-                                    std::size_t opaque_len) {
+  void gpu_event_sum3_heter_f64_i32(cudaStream_t stream, void **buffers, const char *opaque, std::size_t opaque_len) {
     gpu_event_sum3_heter<double, std::uint32_t>(stream, buffers, opaque, opaque_len);
   }
