
Commit ec2a3d8

Merge pull request #20 from lucasb-eyer/moar-inits
Moar inits
2 parents: 7c91a4f + 4ce0777

File tree

11 files changed (+107, -28 lines)


beacon8/init/Const.py

Lines changed: 0 additions & 3 deletions
@@ -5,6 +5,3 @@ def const(value):
     def init(shape, fan):
         return _np.full(shape, value)
     return init
-
-zero = const(0)
-one = const(1)
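
The module-level `zero` and `one` shortcuts are gone; callers now build the closure themselves with `const(0)` / `const(1)`. A minimal numpy-only sketch of the factory pattern all of these init modules share (it mirrors the `const` shown above, so it runs without beacon8):

    import numpy as np

    def const(value):
        # Factory: returns an init(shape, fan) closure, as in beacon8.init.Const.
        def init(shape, fan):
            return np.full(shape, value)
        return init

    zero_init = const(0)                    # replaces the removed module-level `zero`
    print(zero_init((2, 3), fan=None))      # a 2x3 array of zeros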

beacon8/init/Normal.py

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+import numpy as _np
+
+
+def normal(std):
+    def init(shape, fan):
+        return std*_np.random.randn(*shape)
+    return init

beacon8/init/Ortho.py

Lines changed: 39 additions & 0 deletions
@@ -0,0 +1,39 @@
+import numpy as _np
+
+def ortho_qr(gain=_np.sqrt(2)):
+    # gain should be set based on the activation function:
+    # linear activations g = 1 (or greater)
+    # tanh activations g > 1
+    # ReLU activations g = sqrt(2) (or greater)
+
+    def init(shape, **_):
+        # Note that this is not strictly correct.
+        #
+        # What we'd really want is for an initialization which reuses ortho
+        # matrices across layers, but we can't have that with the current arch:
+        #
+        # From A. Saxe's comment in https://plus.google.com/+SoumithChintala/posts/RZfdrRQWL6u
+        # > This initialization uses orthogonal matrices, but there’s a bit of
+        # > subtlety when it comes to undercomplete layers—basically you need to
+        # > make sure that the paths from the input layer to output layer, through
+        # > the bottleneck, are preserved. This is accomplished by reusing parts of
+        # > the same orthogonal matrices across different layers of the network.
+        flat = (shape[0], _np.prod(shape[1:]))
+        q1, _ = _np.linalg.qr(_np.random.randn(flat[0], flat[0]))
+        q2, _ = _np.linalg.qr(_np.random.randn(flat[1], flat[1]))
+        w = _np.dot(q1[:,:min(flat)], q2[:min(flat),:])
+        return gain * w.reshape(shape)
+    return init
+
+def ortho_svd(gain=_np.sqrt(2)):
+    # gain should be set based on the activation function:
+    # linear activations g = 1 (or greater)
+    # tanh activations g > 1
+    # ReLU activations g = sqrt(2) (or greater)
+
+    def init(shape, **_):
+        flat = (shape[0], _np.prod(shape[1:]))
+        u, _, v = _np.linalg.svd(_np.random.randn(*flat), full_matrices=False)
+        w = u if u.shape == flat else v
+        return gain * w.reshape(shape)
+    return init
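
A quick sanity check of the QR-based variant, as a sketch: assuming the beacon8 package from this commit is importable, the rows of the returned matrix should be orthonormal (up to the gain factor) whenever the first dimension is the smaller one.

    import numpy as np
    from beacon8.init import ortho_qr   # assumes this commit's beacon8 is on the path

    w = ortho_qr(gain=1)(shape=(64, 256))
    # With gain=1 and 64 <= 256, the 64 rows are orthonormal: w @ w.T ~= I.
    print(np.allclose(w.dot(w.T), np.eye(64), atol=1e-6))   # True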

beacon8/init/PReLU.py

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+import numpy as _np
+from beacon8.init import xavier, xavierN
+
+def prelu(gain=1):
+    return xavier(gain * _np.sqrt(2))
+
+def preluN(gain=1):
+    return xavierN(gain * _np.sqrt(2))

beacon8/init/Uniform.py

Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+import numpy as _np
+
+
+def uniform(low, high):
+    def init(shape, fan):
+        return _np.random.uniform(low=low, high=high, size=shape)
+    return init

beacon8/init/Xavier.py

Lines changed: 21 additions & 4 deletions
@@ -1,7 +1,24 @@
 import numpy as _np
 
-def xavier(shape, fan):
-    assert fan is not None, "The parameter's `fan` needs to be specified when using Xavier initialization."
+def xavier(gain=1):
+    def init(shape, fan):
+        assert fan is not None, "The parameter's `fan` needs to be specified when using Xavier initialization."
 
-    w_bound = _np.sqrt(4. / sum(fan))
-    return _np.random.uniform(low=-w_bound, high=w_bound, size=shape)
+        fan_mean = _np.mean(fan)
+        bound = _np.sqrt(3./fan_mean)
+        return _np.random.uniform(low=-bound, high=bound, size=shape)
+    return init
+
+def xavierN(gain=1):
+    def init(shape, fan):
+        assert fan is not None, "The parameter's `fan` needs to be specified when using Xavier initialization."
+
+        fan_mean = _np.mean(fan)
+        return _np.sqrt(1./fan_mean) * _np.random.randn(*shape)
+    return init
+
+def xavierSigm(gain=1):
+    return xavier(gain * _np.sqrt(2))
+
+def xavierSigmN(gain=1):
+    return xavierN(gain * _np.sqrt(2))
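
With the factory form, `xavier()` is called once to obtain the closure, and the closure is then called with the shape and `fan` pair; the weights come out uniform in ±sqrt(3/mean(fan)). A usage sketch (again assuming this commit's beacon8 is importable; the layer sizes are made up):

    import numpy as np
    from beacon8.init import xavier      # assumes this commit's beacon8 is on the path

    nin, nout = 784, 100                 # hypothetical layer sizes
    w = xavier()(shape=(nin, nout), fan=(nin, nout))
    bound = np.sqrt(3. / np.mean((nin, nout)))
    print(w.shape, np.abs(w).max() <= bound)   # (784, 100) True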

beacon8/init/__init__.py

Lines changed: 6 additions & 2 deletions
@@ -1,2 +1,6 @@
-from .Const import const, zero, one
-from .Xavier import xavier
+from .Const import const
+from .Xavier import xavier, xavierN, xavierSigm, xavierSigmN
+from .PReLU import prelu, preluN
+from .Normal import normal
+from .Uniform import uniform
+from .Ortho import ortho_qr, ortho_svd

beacon8/layers/BatchNormalization.py

Lines changed: 7 additions & 7 deletions
@@ -1,5 +1,5 @@
 from .Module import Module
-from beacon8.init import zero, one
+from beacon8.init import const
 from beacon8.utils import create_param, create_param_and_grad
 
 import numpy as _np
@@ -11,15 +11,15 @@ class BatchNormalization(Module):
     def __init__(self, n_features, eps=None):
         Module.__init__(self)
 
-        self.weight, self.grad_weight = create_param_and_grad(n_features, one, 'W_BN')
-        self.bias, self.grad_bias = create_param_and_grad(n_features, zero, 'b_BN')
+        self.weight, self.grad_weight = create_param_and_grad(n_features, const(1), 'W_BN')
+        self.bias, self.grad_bias = create_param_and_grad(n_features, const(0), 'b_BN')
 
-        self.inference_weight = create_param(n_features, one, 'W_BN_inf')
-        self.inference_bias = create_param(n_features, zero, 'b_BN_inf')
+        self.inference_weight = create_param(n_features, const(1), 'W_BN_inf')
+        self.inference_bias = create_param(n_features, const(0), 'b_BN_inf')
 
         # These are buffers for collecting the minibatch statistics.
-        self.buffer_variance = create_param(n_features, one, 'BN_var')
-        self.buffer_mean = create_param(n_features, zero, 'BN_mean')
+        self.buffer_variance = create_param(n_features, const(1), 'BN_var')
+        self.buffer_mean = create_param(n_features, const(0), 'BN_mean')
         self.buffer_counts = _th.shared(_np.asarray(0, dtype=_th.config.floatX))
 
         self.eps = eps or 1e-5

beacon8/layers/Linear.py

Lines changed: 4 additions & 4 deletions
@@ -1,5 +1,5 @@
 from .Module import Module
-from beacon8.init import zero, xavier
+from beacon8.init import const, xavier
 from beacon8.utils import create_param_and_grad
 
 import numpy as _np
@@ -8,16 +8,16 @@
 
 class Linear(Module):
 
-    def __init__(self, nin, nout, init=xavier, with_bias=True, init_b=zero):
+    def __init__(self, nin, nout, with_bias=True, initW=xavier(), initB=const(0)):
         Module.__init__(self)
 
         self.nin = nin
         self.nout = nout
         self.with_bias = with_bias
 
-        self.weight, self.grad_weight = create_param_and_grad((nin, nout), init, fan=(nin, nout), name='Wlin_{}x{}'.format(nin, nout))
+        self.weight, self.grad_weight = create_param_and_grad((nin, nout), initW, fan=(nin, nout), name='Wlin_{}x{}'.format(nin, nout))
         if self.with_bias:
-            self.bias, self.grad_bias = create_param_and_grad(nout, init_b, name='blin_{}'.format(nout))
+            self.bias, self.grad_bias = create_param_and_grad(nout, initB, name='blin_{}'.format(nout))
 
     def symb_forward(self, symb_input):
         out = _th.tensor.dot(symb_input, self.weight)
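
The old single `init=`/`init_b=` arguments are replaced by `initW`/`initB`, which take the new initializer closures directly. A hypothetical construction sketch (it assumes this commit's beacon8 and its Theano dependency are installed, and the import path is inferred from the file layout above; layer sizes are made up):

    from beacon8.init import ortho_svd, const
    from beacon8.layers.Linear import Linear   # import path assumed from the file layout above

    # Orthogonally initialized weights, zero-initialized bias.
    fc = Linear(784, 100, initW=ortho_svd(), initB=const(0))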

beacon8/layers/SpatialConvolution.py

Lines changed: 4 additions & 4 deletions
@@ -1,13 +1,13 @@
 from .Module import Module
-from beacon8.init import zero, xavier
+from beacon8.init import const, xavier
 from beacon8.utils import create_param_and_grad
 
 import theano as _th
 import numpy as _np
 
 
 class SpatialConvolution(Module):
-    def __init__(self, n_input_plane, n_output_plane, k_w, k_h, d_w=1, d_h=1, with_bias=True, border_mode='valid', imshape=None, init=xavier, init_b=zero):
+    def __init__(self, n_input_plane, n_output_plane, k_w, k_h, d_w=1, d_h=1, with_bias=True, initW=xavier(), initB=const(0), border_mode='valid', imshape=None):
         Module.__init__(self)
         self.n_input_plane = n_input_plane
         self.n_output_plane = n_output_plane
@@ -22,9 +22,9 @@ def __init__(self, n_input_plane, n_output_plane, k_w, k_h, d_w=1, d_h=1, with_b
         self.w_shape = (n_output_plane, n_input_plane, k_h, k_w)
         w_fan = (n_input_plane*k_w*k_h, n_output_plane*k_w*k_h)
 
-        self.weight, self.grad_weight = create_param_and_grad(self.w_shape, init, fan=w_fan, name='Wconv_{},{}@{}x{}'.format(n_input_plane, n_output_plane, k_w, k_h))
+        self.weight, self.grad_weight = create_param_and_grad(self.w_shape, initW, fan=w_fan, name='Wconv_{},{}@{}x{}'.format(n_input_plane, n_output_plane, k_w, k_h))
         if self.with_bias:
-            self.bias, self.grad_bias = create_param_and_grad(n_output_plane, init_b, name='bconv_{}'.format(n_output_plane))
+            self.bias, self.grad_bias = create_param_and_grad(n_output_plane, initB, name='bconv_{}'.format(n_output_plane))
 
     def symb_forward(self, symb_input):
         conv_output = _th.tensor.nnet.conv.conv2d(symb_input, self.weight,
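
The convolution passes fan=w_fan with w_fan = (n_input_plane*k_w*k_h, n_output_plane*k_w*k_h), so the bound used by the default `xavier()` initializer can be worked out by hand. A numpy-only sketch with made-up layer sizes:

    import numpy as np

    # Hypothetical conv layer: 3 -> 16 feature planes, 5x5 kernels.
    n_input_plane, n_output_plane, k_w, k_h = 3, 16, 5, 5
    w_shape = (n_output_plane, n_input_plane, k_h, k_w)
    w_fan = (n_input_plane*k_w*k_h, n_output_plane*k_w*k_h)   # (75, 400)
    bound = np.sqrt(3. / np.mean(w_fan))                      # the uniform bound xavier() uses
    print(w_shape, w_fan, round(bound, 4))                    # (16, 3, 5, 5) (75, 400) 0.1124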
