
Commit 2e54321

Remove reset and rework inits.
This gets rid of `reset` as discussed in #15 and makes parameter initialization more modular. (More inits to follow.)
1 parent 02c8d47 commit 2e54321
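
Not part of the diff, but for orientation: after this commit an initializer is simply a callable taking (shape, fan) and returning a NumPy array, so layers can be handed any such function. A minimal sketch of that contract (`my_init` is a hypothetical example, not from this commit):

    import numpy as np

    def my_init(shape, fan):
        # `fan` may be None; inits that need it (e.g. xavier) assert on it.
        return np.random.uniform(-0.01, 0.01, size=shape)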

10 files changed: 66 additions & 48 deletions


beacon8/containers/Container.py

Lines changed: 1 addition & 1 deletion
@@ -37,4 +37,4 @@ def get_stat_updates(self):
         return stat_updates
 
     def add(self, module):
-        self.modules.append(module)
+        self.modules.append(module)

beacon8/init/Const.py

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+import numpy as _np
+
+
+def const(value):
+    def init(shape, fan):
+        return _np.full(shape, value)
+    return init
+
+zero = const(0)
+one = const(1)
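
A quick illustration (not part of the diff): `const` builds an init closure that ignores `fan`, and `zero`/`one` are the two ready-made cases. Casting to Theano's floatX happens later, in the create_param helpers:

    from beacon8.init import const, zero, one

    zero((3,), None)         # array of zeros, shape (3,)
    one((2, 2), None)        # 2x2 array of ones
    const(0.1)((4,), None)   # array of shape (4,) filled with 0.1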

beacon8/init/Xavier.py

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+import numpy as _np
+
+def xavier(shape, fan):
+    assert fan is not None, "The parameter's `fan` needs to be specified when using Xavier initialization."
+
+    w_bound = _np.sqrt(4. / sum(fan))
+    return _np.random.uniform(low=-w_bound, high=w_bound, size=shape)
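
For reference (not from the diff): with fan=(fan_in, fan_out) the bound is sqrt(4 / (fan_in + fan_out)), the same formula the old hard-coded Linear.reset used. A rough usage sketch:

    from beacon8.init import xavier

    W = xavier((784, 100), fan=(784, 100))   # uniform in [-b, b], b = sqrt(4/884) ~ 0.067
    xavier((784, 100), None)                 # AssertionError: fan must be specified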

beacon8/init/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+from .Const import const, zero, one
+from .Xavier import xavier

beacon8/layers/BatchNormalization.py

Lines changed: 10 additions & 9 deletions
@@ -1,4 +1,6 @@
 from .Module import Module
+from beacon8.init import zero, one
+from beacon8.utils import create_param, create_param_and_grad
 
 import numpy as _np
 import theano as _th
@@ -9,17 +11,16 @@ class BatchNormalization(Module):
     def __init__(self, n_features, eps=None):
         Module.__init__(self)
 
-        self.weight = _th.shared(_np.ones(shape=(n_features,), dtype=_th.config.floatX))
-        self.bias = _th.shared(_np.zeros(shape=(n_features, ), dtype=_th.config.floatX))
-        self.grad_weight = _th.shared(_np.zeros(shape=(n_features,), dtype=_th.config.floatX))
-        self.grad_bias = _th.shared(_np.zeros(shape=(n_features, ), dtype=_th.config.floatX))
+        self.weight, self.grad_weight = create_param_and_grad(n_features, one, 'W_BN')
+        self.bias, self.grad_bias = create_param_and_grad(n_features, zero, 'b_BN')
 
-        self.inference_weight = _th.shared(_np.ones(shape=(n_features,), dtype=_th.config.floatX))
-        self.inference_bias = _th.shared(_np.zeros(shape=(n_features, ), dtype=_th.config.floatX))
+        self.inference_weight = create_param(n_features, one, 'W_BN_inf')
+        self.inference_bias = create_param(n_features, zero, 'b_BN_inf')
 
-        self.buffer_variance = _th.shared(_np.ones(shape=(n_features, ), dtype=_th.config.floatX))
-        self.buffer_mean = _th.shared(_np.zeros(shape=(n_features, ), dtype=_th.config.floatX))
-        self.buffer_counts = _th.shared(_np.asarray(0., dtype=_th.config.floatX))
+        # These are buffers for collecting the minibatch statistics.
+        self.buffer_variance = create_param(n_features, one, 'BN_var')
+        self.buffer_mean = create_param(n_features, zero, 'BN_mean')
+        self.buffer_counts = _th.shared(_np.asarray(0, dtype=_th.config.floatX))
 
         self.eps = eps or 1e-5
 
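
A small sanity check, illustrative only (the import path assumes the module file shown above): every per-feature parameter and buffer now comes out of the create_param helpers with shape (n_features,):

    from beacon8.layers.BatchNormalization import BatchNormalization

    bn = BatchNormalization(64)
    bn.weight.get_value().shape        # (64,), initialized to ones
    bn.buffer_mean.get_value().shape   # (64,), initialized to zeros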

beacon8/layers/Linear.py

Lines changed: 5 additions & 17 deletions
@@ -1,35 +1,23 @@
 from .Module import Module
+from beacon8.init import zero, xavier
+from beacon8.utils import create_param_and_grad
 
 import numpy as _np
 import theano as _th
 
 
 class Linear(Module):
 
-    def __init__(self, nin, nout, init='Xavier', with_bias=True):
+    def __init__(self, nin, nout, init=xavier, with_bias=True, init_b=zero):
         Module.__init__(self)
 
         self.nin = nin
         self.nout = nout
-        self.init = init
         self.with_bias = with_bias
 
-        self.reset()
-
-    def reset(self):
-        if self.init == 'Xavier':
-            w_bound = _np.sqrt(4. / (self.nin + self.nout))
-            W = _np.random.uniform(low=-w_bound, high=w_bound,
-                                   size=(self.nin, self.nout))
-        else:
-            raise NotImplementedError
-
-        self.weight = _th.shared(W.astype(_th.config.floatX))
-        self.grad_weight = _th.shared((W*0.).astype(_th.config.floatX))
-
+        self.weight, self.grad_weight = create_param_and_grad((nin, nout), init, fan=(nin, nout), name='Wlin_{}x{}'.format(nin, nout))
         if self.with_bias:
-            self.bias = _th.shared(_np.zeros(shape=self.nout, dtype=_th.config.floatX))
-            self.grad_bias = _th.shared(_np.zeros(shape=self.nout, dtype=_th.config.floatX))
+            self.bias, self.grad_bias = create_param_and_grad(nout, init_b, name='blin_{}'.format(nout))
 
     def symb_forward(self, symb_input):
         out = _th.tensor.dot(symb_input, self.weight)
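
What the rework buys (illustration only, not in the diff): callers can now pass any initializer instead of the old 'Xavier' string, and the bias init is configurable too:

    from beacon8.layers.Linear import Linear
    from beacon8.init import const

    fc1 = Linear(784, 100)                   # defaults: xavier weights, zero bias
    fc2 = Linear(100, 10, init=const(0.01))  # custom weight init, bias still zero
    fc1.weight.get_value().shape             # (784, 100)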

beacon8/layers/Module.py

Lines changed: 0 additions & 3 deletions
@@ -11,9 +11,6 @@ def __init__(self):
         self.fn_accum_grads = None
         self.fn_accum_stats = None
 
-    def reset(self):
-        pass
-
     #def __hash__(self):
     #    raise NotImplementedError("You *need* to reimplement hash, even if it's just python's default. See the documentation for more info.")
 

beacon8/layers/SpatialConvolution.py

Lines changed: 8 additions & 9 deletions
@@ -1,11 +1,13 @@
 from .Module import Module
+from beacon8.init import zero, xavier
+from beacon8.utils import create_param_and_grad
 
 import theano as _th
 import numpy as _np
 
 
 class SpatialConvolution(Module):
-    def __init__(self, n_input_plane, n_output_plane, k_w, k_h, d_w=1, d_h=1, with_bias=True, border_mode='valid', imshape=None):
+    def __init__(self, n_input_plane, n_output_plane, k_w, k_h, d_w=1, d_h=1, with_bias=True, border_mode='valid', imshape=None, init=xavier, init_b=zero):
         Module.__init__(self)
         self.n_input_plane = n_input_plane
         self.n_output_plane = n_output_plane
@@ -17,19 +19,17 @@ def __init__(self, n_input_plane, n_output_plane, k_w, k_h, d_w=1, d_h=1, with_b
         self.border_mode = border_mode
         self.imshape = imshape
 
-        w_bound = _np.sqrt(4. / ((self.n_input_plane + self.n_output_plane) * self.k_w * self.k_h))
-        W = _np.random.uniform(low=-w_bound, high=w_bound, size=(n_output_plane, n_input_plane, k_h, k_w))
-        self.weight = _th.shared(W.astype(dtype=_th.config.floatX))
-        self.grad_weight = _th.shared((W*0).astype(dtype=_th.config.floatX))
+        self.w_shape = (n_output_plane, n_input_plane, k_h, k_w)
+        w_fan = (n_input_plane*k_w*k_h, n_output_plane*k_w*k_h)
 
+        self.weight, self.grad_weight = create_param_and_grad(self.w_shape, init, fan=w_fan, name='Wconv_{},{}@{}x{}'.format(n_input_plane, n_output_plane, k_w, k_h))
         if self.with_bias:
-            self.bias = _th.shared(_np.zeros(shape=(n_output_plane, ), dtype=_th.config.floatX))
-            self.grad_bias = _th.shared(_np.zeros(shape=(n_output_plane, ), dtype=_th.config.floatX))
+            self.bias, self.grad_bias = create_param_and_grad(n_output_plane, init_b, name='bconv_{}'.format(n_output_plane))
 
     def symb_forward(self, symb_input):
         conv_output = _th.tensor.nnet.conv.conv2d(symb_input, self.weight,
                                                   image_shape=(None, self.n_input_plane) + (self.imshape or (None, None)),
-                                                  filter_shape=(self.n_output_plane, self.n_input_plane, self.k_h, self.k_w),
+                                                  filter_shape=self.w_shape,
                                                   border_mode=self.border_mode,
                                                   subsample=(self.d_h, self.d_w)
                                                   )
@@ -38,4 +38,3 @@ def symb_forward(self, symb_input):
             return conv_output + self.bias.dimshuffle('x', 0, 'x', 'x')
         else:
             return conv_output
-
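
Editorial note with a worked check (not in the diff): the new fan convention for convolutions is fan = (n_input_plane*k_w*k_h, n_output_plane*k_w*k_h), which reproduces the old hard-coded bound exactly:

    # hypothetical numbers, illustration only
    n_in, n_out, k_w, k_h = 3, 16, 5, 5
    w_fan = (n_in*k_w*k_h, n_out*k_w*k_h)                 # (75, 400)
    # new xavier bound: sqrt(4 / (75 + 400))   = sqrt(4/475)
    # old formula:      sqrt(4 / ((3+16)*5*5)) = sqrt(4/475)  -- identical, ~0.092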

beacon8/layers/SpatialConvolutionCUDNN.py

Lines changed: 9 additions & 9 deletions
@@ -1,12 +1,14 @@
+from .Module import Module
+from beacon8.init import zero, xavier
+from beacon8.utils import create_param_and_grad
+
 import theano as _th
 import numpy as _np
 import theano.sandbox.cuda.dnn as _dnn
 
-from .Module import Module
-
 
 class SpatialConvolutionCUDNN(Module):
-    def __init__(self, n_input_plane, n_output_plane, k_w, k_h, d_w=1, d_h=1, pad_w=0, pad_h=0, with_bias=True):
+    def __init__(self, n_input_plane, n_output_plane, k_w, k_h, d_w=1, d_h=1, pad_w=0, pad_h=0, with_bias=True, init=xavier, init_b=zero):
         Module.__init__(self)
         self.n_input_plane = n_input_plane
         self.n_output_plane = n_output_plane
@@ -17,15 +19,13 @@ def __init__(self, n_input_plane, n_output_plane, k_w, k_h, d_w=1, d_h=1, pad_w=
         self.pad_w = pad_w
         self.pad_h = pad_h
         self.with_bias = with_bias
-        w_bound = _np.sqrt(4. / ((self.n_input_plane + self.n_output_plane) * self.k_w * self.k_h))
 
-        W = _np.random.uniform(low=-w_bound, high=w_bound, size=(n_output_plane, n_input_plane, k_h, k_w))
-        self.weight = _th.shared(W.astype(dtype=_th.config.floatX))
-        self.grad_weight = _th.shared((W*0).astype(dtype=_th.config.floatX))
+        w_shape = (n_output_plane, n_input_plane, k_h, k_w)
+        w_fan = (n_input_plane*k_w*k_h, n_output_plane*k_w*k_h)
 
+        self.weight, self.grad_weight = create_param_and_grad(w_shape, init, fan=w_fan, name='Wconv_{},{}@{}x{}'.format(n_input_plane, n_output_plane, k_w, k_h))
         if self.with_bias:
-            self.bias = _th.shared(_np.zeros(shape=(n_output_plane, ), dtype=_th.config.floatX))
-            self.grad_bias = _th.shared(_np.zeros(shape=(n_output_plane, ), dtype=_th.config.floatX))
+            self.bias, self.grad_bias = create_param_and_grad(n_output_plane, init_b, name='bconv_{}'.format(n_output_plane))
 
     def symb_forward(self, symb_input):
         conv_output = _dnn.dnn_conv(img=symb_input,

beacon8/utils.py

Lines changed: 14 additions & 0 deletions
@@ -1,4 +1,18 @@
 import theano as _th
+import numpy as _np
+
+
+def create_param(shape, init, fan=None, name=None, type=_th.config.floatX):
+    return _th.shared(init(shape, fan).astype(type), name=name)
+
+
+def create_param_and_grad(shape, init, fan=None, name=None, type=_th.config.floatX):
+    val = init(shape, fan).astype(type)
+    param = _th.shared(val, name=name)
+    grad_name = 'grad_' + name if name is not None else None
+    grad_param = _th.shared(_np.zeros_like(val), name=grad_name)
+    return param, grad_param
+
 
 def create_param_state_as(other, initial_value=0):
     return _th.shared(other.get_value()*0 + initial_value,
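
A usage sketch of the new helpers (illustrative): both wrap the initialized value in a Theano shared variable, and create_param_and_grad additionally allocates a zero-filled gradient accumulator whose name gets a 'grad_' prefix:

    from beacon8.utils import create_param, create_param_and_grad
    from beacon8.init import xavier, zero

    W, dW = create_param_and_grad((5, 3), xavier, fan=(5, 3), name='W_demo')
    W.get_value().shape    # (5, 3)
    dW.get_value().sum()   # 0.0 -- gradients start at zero
    dW.name                # 'grad_W_demo'

    b = create_param(3, zero, name='b_demo')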
