
Commit ec2a3d8

Merge pull request #20 from lucasb-eyer/moar-inits
Moar inits
2 parents: 7c91a4f + 4ce0777

File tree

11 files changed (+107, -28 lines)


beacon8/init/Const.py

Lines changed: 0 additions & 3 deletions
@@ -5,6 +5,3 @@ def const(value):
     def init(shape, fan):
         return _np.full(shape, value)
     return init
-
-zero = const(0)
-one = const(1)
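
The module-level `zero` and `one` shortcuts are gone; callers now build the closure themselves with `const(0)` / `const(1)`. A minimal numpy-only sketch of the factory pattern all of these init modules share (it mirrors the `const` shown above, so it runs without beacon8):

    import numpy as np

    def const(value):
        # Factory: returns an init(shape, fan) closure, as in beacon8.init.Const.
        def init(shape, fan):
            return np.full(shape, value)
        return init

    zero_init = const(0)                    # replaces the removed module-level `zero`
    print(zero_init((2, 3), fan=None))      # a 2x3 array of zeros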

beacon8/init/Normal.py

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+import numpy as _np
+
+
+def normal(std):
+    def init(shape, fan):
+        return std*_np.random.randn(*shape)
+    return init

beacon8/init/Ortho.py

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+import numpy as _np
+
+def ortho_qr(gain=_np.sqrt(2)):
+    # gain should be set based on the activation function:
+    # linear activations g = 1 (or greater)
+    # tanh activations g > 1
+    # ReLU activations g = sqrt(2) (or greater)
+
+    def init(shape, **_):
+        # Note that this is not strictly correct.
+        #
+        # What we'd really want is for an initialization which reuses ortho
+        # matrices across layers, but we can't have that with the current arch:
+        #
+        # From A. Saxe's comment in https://plus.google.com/+SoumithChintala/posts/RZfdrRQWL6u
+        # > This initialization uses orthogonal matrices, but there’s a bit of
+        # > subtlety when it comes to undercomplete layers—basically you need to
+        # > make sure that the paths from the input layer to output layer, through
+        # > the bottleneck, are preserved. This is accomplished by reusing parts of
+        # > the same orthogonal matrices across different layers of the network.
+        flat = (shape[0], _np.prod(shape[1:]))
+        q1, _ = _np.linalg.qr(_np.random.randn(flat[0], flat[0]))
+        q2, _ = _np.linalg.qr(_np.random.randn(flat[1], flat[1]))
+        w = _np.dot(q1[:,:min(flat)], q2[:min(flat),:])
+        return gain * w.reshape(shape)
+    return init
+
+def ortho_svd(gain=_np.sqrt(2)):
+    # gain should be set based on the activation function:
+    # linear activations g = 1 (or greater)
+    # tanh activations g > 1
+    # ReLU activations g = sqrt(2) (or greater)
+
+    def init(shape, **_):
+        flat = (shape[0], _np.prod(shape[1:]))
+        u, _, v = _np.linalg.svd(_np.random.randn(*flat), full_matrices=False)
+        w = u if u.shape == flat else v
+        return gain * w.reshape(shape)
+    return init
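
A quick sanity check of the QR-based variant, as a sketch: assuming the beacon8 package from this commit is importable, the rows of the returned matrix should be orthonormal (up to the gain factor) whenever the first dimension is the smaller one.

    import numpy as np
    from beacon8.init import ortho_qr   # assumes this commit's beacon8 is on the path

    w = ortho_qr(gain=1)(shape=(64, 256))
    # With gain=1 and 64 <= 256, the 64 rows are orthonormal: w @ w.T ~= I.
    print(np.allclose(w.dot(w.T), np.eye(64), atol=1e-6))   # True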

beacon8/init/PReLU.py

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+import numpy as _np
+from beacon8.init import xavier, xavierN
+
+def prelu(gain=1):
+    return xavier(gain * _np.sqrt(2))
+
+def preluN(gain=1):
+    return xavierN(gain * _np.sqrt(2))

beacon8/init/Uniform.py

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+import numpy as _np
+
+
+def uniform(low, high):
+    def init(shape, fan):
+        return _np.random.uniform(low=low, high=high, size=shape)
+    return init

beacon8/init/Xavier.py

Lines changed: 21 additions & 4 deletions
@@ -1,7 +1,24 @@
 import numpy as _np
 
-def xavier(shape, fan):
-    assert fan is not None, "The parameter's `fan` needs to be specified when using Xavier initialization."
+def xavier(gain=1):
+    def init(shape, fan):
+        assert fan is not None, "The parameter's `fan` needs to be specified when using Xavier initialization."
 
-    w_bound = _np.sqrt(4. / sum(fan))
-    return _np.random.uniform(low=-w_bound, high=w_bound, size=shape)
+        fan_mean = _np.mean(fan)
+        bound = _np.sqrt(3./fan_mean)
+        return _np.random.uniform(low=-bound, high=bound, size=shape)
+    return init
+
+def xavierN(gain=1):
+    def init(shape, fan):
+        assert fan is not None, "The parameter's `fan` needs to be specified when using Xavier initialization."
+
+        fan_mean = _np.mean(fan)
+        return _np.sqrt(1./fan_mean) * _np.random.randn(*shape)
+    return init
+
+def xavierSigm(gain=1):
+    return xavier(gain * _np.sqrt(2))
+
+def xavierSigmN(gain=1):
+    return xavierN(gain * _np.sqrt(2))
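
With the factory form, `xavier()` is called once to obtain the closure, and the closure is then called with the shape and `fan` pair; the weights come out uniform in ±sqrt(3/mean(fan)). A usage sketch (again assuming this commit's beacon8 is importable; the layer sizes are made up):

    import numpy as np
    from beacon8.init import xavier      # assumes this commit's beacon8 is on the path

    nin, nout = 784, 100                 # hypothetical layer sizes
    w = xavier()(shape=(nin, nout), fan=(nin, nout))
    bound = np.sqrt(3. / np.mean((nin, nout)))
    print(w.shape, np.abs(w).max() <= bound)   # (784, 100) True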

beacon8/init/__init__.py

Lines changed: 6 additions & 2 deletions
@@ -1,2 +1,6 @@
-from .Const import const, zero, one
-from .Xavier import xavier
+from .Const import const
+from .Xavier import xavier, xavierN, xavierSigm, xavierSigmN
+from .PReLU import prelu, preluN
+from .Normal import normal
+from .Uniform import uniform
+from .Ortho import ortho_qr, ortho_svd

beacon8/layers/BatchNormalization.py

Lines changed: 7 additions & 7 deletions
@@ -1,5 +1,5 @@
 from .Module import Module
-from beacon8.init import zero, one
+from beacon8.init import const
 from beacon8.utils import create_param, create_param_and_grad
 
 import numpy as _np
@@ -11,15 +11,15 @@ class BatchNormalization(Module):
     def __init__(self, n_features, eps=None):
         Module.__init__(self)
 
-        self.weight, self.grad_weight = create_param_and_grad(n_features, one, 'W_BN')
-        self.bias, self.grad_bias = create_param_and_grad(n_features, zero, 'b_BN')
+        self.weight, self.grad_weight = create_param_and_grad(n_features, const(1), 'W_BN')
+        self.bias, self.grad_bias = create_param_and_grad(n_features, const(0), 'b_BN')
 
-        self.inference_weight = create_param(n_features, one, 'W_BN_inf')
-        self.inference_bias = create_param(n_features, zero, 'b_BN_inf')
+        self.inference_weight = create_param(n_features, const(1), 'W_BN_inf')
+        self.inference_bias = create_param(n_features, const(0), 'b_BN_inf')
 
         # These are buffers for collecting the minibatch statistics.
-        self.buffer_variance = create_param(n_features, one, 'BN_var')
-        self.buffer_mean = create_param(n_features, zero, 'BN_mean')
+        self.buffer_variance = create_param(n_features, const(1), 'BN_var')
+        self.buffer_mean = create_param(n_features, const(0), 'BN_mean')
         self.buffer_counts = _th.shared(_np.asarray(0, dtype=_th.config.floatX))
 
         self.eps = eps or 1e-5

beacon8/layers/Linear.py

Lines changed: 4 additions & 4 deletions
@@ -1,5 +1,5 @@
 from .Module import Module
-from beacon8.init import zero, xavier
+from beacon8.init import const, xavier
 from beacon8.utils import create_param_and_grad
 
 import numpy as _np
@@ -8,16 +8,16 @@
 
 class Linear(Module):
 
-    def __init__(self, nin, nout, init=xavier, with_bias=True, init_b=zero):
+    def __init__(self, nin, nout, with_bias=True, initW=xavier(), initB=const(0)):
         Module.__init__(self)
 
         self.nin = nin
         self.nout = nout
         self.with_bias = with_bias
 
-        self.weight, self.grad_weight = create_param_and_grad((nin, nout), init, fan=(nin, nout), name='Wlin_{}x{}'.format(nin, nout))
+        self.weight, self.grad_weight = create_param_and_grad((nin, nout), initW, fan=(nin, nout), name='Wlin_{}x{}'.format(nin, nout))
         if self.with_bias:
-            self.bias, self.grad_bias = create_param_and_grad(nout, init_b, name='blin_{}'.format(nout))
+            self.bias, self.grad_bias = create_param_and_grad(nout, initB, name='blin_{}'.format(nout))
 
     def symb_forward(self, symb_input):
         out = _th.tensor.dot(symb_input, self.weight)
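
The old single `init=`/`init_b=` arguments are replaced by `initW`/`initB`, which take the new initializer closures directly. A hypothetical construction sketch (it assumes this commit's beacon8 and its Theano dependency are installed, and the import path is inferred from the file layout above; layer sizes are made up):

    from beacon8.init import ortho_svd, const
    from beacon8.layers.Linear import Linear   # import path assumed from the file layout above

    # Orthogonally initialized weights, zero-initialized bias.
    fc = Linear(784, 100, initW=ortho_svd(), initB=const(0))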

beacon8/layers/SpatialConvolution.py

Lines changed: 4 additions & 4 deletions
@@ -1,13 +1,13 @@
 from .Module import Module
-from beacon8.init import zero, xavier
+from beacon8.init import const, xavier
 from beacon8.utils import create_param_and_grad
 
 import theano as _th
 import numpy as _np
 
 
 class SpatialConvolution(Module):
-    def __init__(self, n_input_plane, n_output_plane, k_w, k_h, d_w=1, d_h=1, with_bias=True, border_mode='valid', imshape=None, init=xavier, init_b=zero):
+    def __init__(self, n_input_plane, n_output_plane, k_w, k_h, d_w=1, d_h=1, with_bias=True, initW=xavier(), initB=const(0), border_mode='valid', imshape=None):
         Module.__init__(self)
         self.n_input_plane = n_input_plane
         self.n_output_plane = n_output_plane
@@ -22,9 +22,9 @@ def __init__(self, n_input_plane, n_output_plane, k_w, k_h, d_w=1, d_h=1, with_b
         self.w_shape = (n_output_plane, n_input_plane, k_h, k_w)
         w_fan = (n_input_plane*k_w*k_h, n_output_plane*k_w*k_h)
 
-        self.weight, self.grad_weight = create_param_and_grad(self.w_shape, init, fan=w_fan, name='Wconv_{},{}@{}x{}'.format(n_input_plane, n_output_plane, k_w, k_h))
+        self.weight, self.grad_weight = create_param_and_grad(self.w_shape, initW, fan=w_fan, name='Wconv_{},{}@{}x{}'.format(n_input_plane, n_output_plane, k_w, k_h))
         if self.with_bias:
-            self.bias, self.grad_bias = create_param_and_grad(n_output_plane, init_b, name='bconv_{}'.format(n_output_plane))
+            self.bias, self.grad_bias = create_param_and_grad(n_output_plane, initB, name='bconv_{}'.format(n_output_plane))
 
     def symb_forward(self, symb_input):
         conv_output = _th.tensor.nnet.conv.conv2d(symb_input, self.weight,
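
The convolution passes fan=w_fan with w_fan = (n_input_plane*k_w*k_h, n_output_plane*k_w*k_h), so the bound used by the default `xavier()` initializer can be worked out by hand. A numpy-only sketch with made-up layer sizes:

    import numpy as np

    # Hypothetical conv layer: 3 -> 16 feature planes, 5x5 kernels.
    n_input_plane, n_output_plane, k_w, k_h = 3, 16, 5, 5
    w_shape = (n_output_plane, n_input_plane, k_h, k_w)
    w_fan = (n_input_plane*k_w*k_h, n_output_plane*k_w*k_h)   # (75, 400)
    bound = np.sqrt(3. / np.mean(w_fan))                      # the uniform bound xavier() uses
    print(w_shape, w_fan, round(bound, 4))                    # (16, 3, 5, 5) (75, 400) 0.1124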
