Commit bfd880a

Merge pull request #53 from lucasb-eyer/param-refactor
Introduce `Param` class.
2 parents eddce17 + df10d88

20 files changed, +179 -188 lines

DeepFried2/Container.py

Lines changed: 10 additions & 15 deletions
@@ -1,4 +1,6 @@
 import DeepFried2 as df
+from collections import OrderedDict as _OrderedDict
+from itertools import chain as _chain


 class Container(df.Module):
@@ -19,24 +21,17 @@ def training(self):
         for module in self.modules:
             module.training()

-    def parameters(self):
-        params, grads = [], []
+    def parameters(self, *a, **kw):
+        params = _chain.from_iterable(m.parameters(*a, **kw) for m in self.modules)

-        for module in self.modules:
-            mod_params, mod_grads = module.parameters()
-            params += mod_params
-            grads += mod_grads
-
-        return params, grads
-
-    def may_decay(self):
-        return sum((m.may_decay() for m in self.modules), [])
+        # We actually need to remove duplicates from the list of parameters
+        # (and their corresponding gradients) in order to support reusing
+        # the same layer at multiple places in the graph,
+        # e.g. do weight sharing.
+        return list(_OrderedDict.fromkeys(params).keys())

     def get_stat_updates(self):
-        stat_updates = []
-        for module in self.modules:
-            stat_updates += module.get_stat_updates()
-        return stat_updates
+        return _chain.from_iterable(m.get_stat_updates() for m in self.modules)

     def add(self, *modules):
         for m in modules:
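
As the new comment in `Container.parameters` notes, duplicates must be dropped so that a layer reused in several places (weight sharing) contributes its parameters only once. A minimal sketch in plain Python, with hypothetical placeholder objects rather than DeepFried2 modules, of why `OrderedDict.fromkeys` is used instead of `set`: it removes duplicates while preserving the original order.

    from collections import OrderedDict
    from itertools import chain

    # Two "modules" report their parameters; `shared` appears in both,
    # as it would when the same layer is used twice in the graph.
    shared, other = object(), object()
    per_module_params = [[shared, other], [shared]]

    params = chain.from_iterable(per_module_params)
    unique = list(OrderedDict.fromkeys(params).keys())
    assert unique == [shared, other]  # de-duplicated, order preserved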

DeepFried2/Module.py

Lines changed: 22 additions & 43 deletions
@@ -1,7 +1,6 @@
 import DeepFried2 as df
 from DeepFried2.utils import make_tensor_or_tensors, aslist

-from collections import OrderedDict as _OrderedDict
 import numpy as _np

 class Module:
@@ -19,44 +18,24 @@ def __init__(self):
     #def __hash__(self):
     #    raise NotImplementedError("You *need* to reimplement hash, even if it's just python's default. See the documentation for more info.")

+    def _addparam(self, *a, **kw):
+        # Add it here because many don't even have params. This avoids misuse.
+        if not hasattr(self, '_params'):
+            self._params = []
+
+        param = df.Param(*a, **kw)
+        self._params.append(param)
+        return param
+
     def zero_grad_parameters(self):
-        _, grads = self.unique_parameters()  # Here, it's just a matter of performance. But even then, not really.
-        for grad in grads:
-            grad.set_value(_np.zeros_like(grad.get_value()))
-
-    def parameters(self):
-        params, grads = [], []
-
-        if hasattr(self, 'weight'):
-            assert hasattr(self, 'grad_weight'), "The layer {} has a `weight` variable but no `grad_weight`, you probably forget to implement it.".format(df.utils.typename(self))
-            params += [self.weight]
-            grads += [self.grad_weight]
-
-        if hasattr(self, 'bias'):
-            assert hasattr(self, 'grad_bias'), "The layer {} has a `bias` variable but no `grad_bias`, you probably forget to implement it.".format(df.utils.typename(self))
-            params += [self.bias]
-            grads += [self.grad_bias]
-
-        return params, grads
-
-    def unique_parameters(self):
-        # We actually need to remove duplicates from the list of parameters
-        # (and their corresponding gradients) in order to support reusing
-        # the same layer at multiple places in the graph,
-        # e.g. do weight sharing.
-        params, grads = self.parameters()
-        return (
-            list(_OrderedDict.fromkeys(params).keys()),
-            list(_OrderedDict.fromkeys(grads).keys()),
-        )
-
-    def may_decay(self):
-        flags = []
-        if hasattr(self, 'weight'):
-            flags += [True]
-        if hasattr(self, 'bias'):
-            flags += [False]
-        return flags
+        for p in self.parameters(trainable_only=True):
+            p.zero_grad()
+
+    def parameters(self, trainable_only=False):
+        params = getattr(self, '_params', [])
+        if trainable_only:
+            params = [p for p in params if p.trainable()]
+        return params

     def evaluate(self):
         self.training_mode = False
@@ -85,10 +64,10 @@ def accumulate_gradients(self, data_in, data_tgt, loss):
         symb_out = self.symb_forward(symb_in)
         symb_err = loss.full_symb_forward(symb_out, symb_tgt)

-        params, grads = self.unique_parameters()
-        symb_grads = df.th.grad(cost=symb_err, wrt=params)
+        params = self.parameters(trainable_only=True)
+        symb_grads = df.th.grad(cost=symb_err, wrt=[p.param for p in params])
+        grads_updates = [(p.grad, p.grad + symb_grad) for p, symb_grad in zip(params, symb_grads)]

-        grads_updates = [(grad, grad + symb_grad) for grad, symb_grad in zip(grads, symb_grads)]
         self._fn_accum_grads[self.training_mode] = df.th.function(
             inputs=aslist(symb_in) + aslist(symb_tgt),
             outputs=symb_err,
@@ -151,8 +130,8 @@ def clear(self):
         self._fn_accum_stats.clear()

     def __getstate__(self):
-        return [p.get_value() for p in self.unique_parameters()[0]]
+        return [p.get_value() for p in self.parameters()]

     def __setstate__(self, state):
-        for p, s in zip(self.unique_parameters()[0], state):
+        for p, s in zip(self.parameters(), state):
             p.set_value(s)
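
Under the new scheme a layer no longer sets `self.weight`/`self.grad_weight` pairs; it registers each parameter through `_addparam`, and the base class collects them. A rough sketch of a toy layer written against this API (the `TinyAffine` name, the shapes, and the `fan` value are made up for illustration and do not appear in the commit):

    import DeepFried2 as df

    class TinyAffine(df.Module):  # hypothetical example layer
        def __init__(self, nin, nout):
            df.Module.__init__(self)
            # `_addparam` wraps the shape/init in a `df.Param` and records it.
            self.W = self._addparam((nin, nout), df.init.xavier(), fan=(nin, nout), name='W_tiny')
            self.b = self._addparam(nout, df.init.const(0), decay=False, name='b_tiny')

        def symb_forward(self, symb_input):
            return df.T.dot(symb_input, self.W.param) + self.b.param

    layer = TinyAffine(3, 2)
    assert len(layer.parameters()) == 2
    assert len(layer.parameters(trainable_only=True)) == 2  # both allocate gradients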

DeepFried2/Optimizer.py

Lines changed: 1 addition & 1 deletion
@@ -10,9 +10,9 @@ def __init__(self, **hyperparams):
     def update_parameters(self, model):

         if model not in self.states:
-            params, grads = model.unique_parameters()
             # TODO: Not only scalar, e.g. Adam might profit from integer t
             hyperparams = {name: df.T.scalar(name) for name in self.hyperparams}
+            params, grads = zip(*[(p.param, p.grad) for p in model.parameters(trainable_only=True)])
             updates = self.get_updates(params, grads, **hyperparams)
             self.states[model] = df.th.function(
                 inputs=list(hyperparams.values()),
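
The `zip(*...)` idiom above transposes a list of `(param, grad)` pairs into two parallel sequences, so `get_updates` keeps its old `(params, grads)` signature. A plain-Python illustration with placeholder strings standing in for the shared variables:

    pairs = [('W', 'grad_W'), ('b', 'grad_b')]  # stand-ins for (p.param, p.grad)
    params, grads = zip(*pairs)
    assert params == ('W', 'b')
    assert grads == ('grad_W', 'grad_b')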

DeepFried2/Param.py

Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,38 @@
+import DeepFried2 as df
+import numpy as _np
+
+
+class Param:
+
+    def __init__(self, shape, init, fan=None, name=None, learn=True, decay=True, dtype=df.floatX, **kw):
+        self.init = init
+        self.shape = shape
+        self.fan = fan
+        self.decay = decay
+
+        val = init(self.shape, self.fan).astype(dtype)
+        self.param = df.th.shared(val, name=name, **kw)
+
+        if learn:
+            grad_name = 'grad_' + name if name is not None else None
+            self.grad = df.th.shared(_np.zeros_like(val), name=grad_name, **kw)
+        else:
+            self.grad = None
+
+    def get_value(self):
+        return self.param.get_value()
+
+    def set_value(self, val):
+        self.param.set_value(val)
+
+    def reinit(self):
+        self.param.set_value(self.init(self.shape, self.fan).astype(self.param.dtype))
+
+    def zero_grad(self):
+        self.grad.set_value(_np.zeros(self.shape, self.param.dtype))
+
+    def may_decay(self):
+        return self.grad is not None and self.decay
+
+    def trainable(self):
+        return self.grad is not None
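
Outside of a layer, a `Param` can also be built directly. A small sketch of the lifecycle, assuming Theano is available and that `df.init.const` returns a callable taking `(shape, fan)` as `Param.__init__` expects:

    import numpy as np
    import DeepFried2 as df

    p = df.Param(3, df.init.const(1), name='toy')   # learn=True by default
    print(p.get_value())     # [1. 1. 1.]
    print(p.trainable())     # True: a gradient buffer was allocated
    print(p.may_decay())     # True: trainable and decay=True

    p.set_value(np.arange(3, dtype=df.floatX))
    p.zero_grad()            # reset the accumulated gradient to zeros
    p.reinit()               # back to the initializer's values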

DeepFried2/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -4,6 +4,8 @@

 import DeepFried2.init as init

+from .Param import Param
+
 from .Module import Module
 from .layers import *

DeepFried2/criteria/WeightDecay.py

Lines changed: 8 additions & 16 deletions
@@ -5,28 +5,20 @@


 class L1WeightDecay:
-    def __init__(self, *containers):
-        self.containers = containers
+    def __init__(self, *modules):
+        self.modules = modules

     def symb_forward(self):
-        return sum(df.T.sum(abs(p)) for p in collect_decayable_params(*self.containers))
+        return sum(df.T.sum(abs(p)) for p in _collect_decayable_params(self.modules))


 class L2WeightDecay:
-    def __init__(self, *containers):
-        self.containers = containers
+    def __init__(self, *modules):
+        self.modules = modules

     def symb_forward(self):
-        return sum(df.T.sum(p**2) for p in collect_decayable_params(*self.containers))
+        return sum(df.T.sum(p**2) for p in _collect_decayable_params(self.modules))


-def collect_decayable_params(*containers):
-    decay_params = []
-    for c in containers:
-        params, _ = c.unique_parameters()  # TODO: unique or non-unique?
-        may = c.may_decay()
-
-        assert len(params) == len(may), "Possible implementation bug in `{}.may_decay()`: {} parameters, but {} decay infos.".format(df.utils.typename(c), len(params), len(may))
-
-        decay_params += [p for p,m in zip(params, may) if may]
-    return decay_params
+def _collect_decayable_params(modules):
+    return [p.param for c in modules for p in c.parameters() if p.may_decay()]
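
`_collect_decayable_params` now simply asks each parameter whether it may decay, so the decision made at registration time (`decay=False` for biases, `learn=False` for inference buffers) is what excludes it from the penalty. A hedged sketch of that filter, using `Param` directly rather than a full module:

    import DeepFried2 as df

    W = df.Param(4, df.init.const(1), name='W')                    # decays
    b = df.Param(4, df.init.const(0), name='b', decay=False)       # skipped
    buf = df.Param(4, df.init.const(0), name='buf', learn=False)   # skipped too

    decayable = [p.param for p in (W, b, buf) if p.may_decay()]
    assert len(decayable) == 1
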
Lines changed: 11 additions & 10 deletions
@@ -1,13 +1,13 @@
 import DeepFried2 as df
-from DeepFried2.utils import create_param_and_grad, expand
+from DeepFried2.utils import expand
 from theano.sandbox.cuda.basic_ops import gpu_contiguous, gpu_alloc_empty
 from theano.sandbox.cuda import dnn

 import numpy as np


 class BackwardsConvolutionCUDNN(df.Module):
-    def __init__(self, nchan_in, nchan_out, filter_size, stride=1, border=0, mode='cross', with_bias=True, initW=df.init.xavier(), initB=df.init.const(0)):
+    def __init__(self, nchan_in, nchan_out, filter_size, stride=1, border=0, mode='cross', init=df.init.xavier(), bias=df.init.const(0)):
         # mode='cross' is the default in Lasagne[1], Torch[2], matConvNet[3], Caffee[4].
         #
         # 1: https://github.com/Lasagne/Lasagne/blob/63d44a0d/lasagne/layers/dnn.py#L299
@@ -19,7 +19,6 @@ def __init__(self, nchan_in, nchan_out, filter_size, stride=1, border=0, mode='c
         self.nchan_out = nchan_out
         self.filter_size = filter_size
         self.mode = mode
-        self.with_bias = with_bias
         self.stride = expand(stride, len(filter_size), 'stride')
         self.border = expand(border, len(filter_size), 'border')

@@ -30,27 +29,29 @@ def __init__(self, nchan_in, nchan_out, filter_size, stride=1, border=0, mode='c

         w_shape = (nchan_in, nchan_out) + self.filter_size
         w_fan = (np.prod(self.filter_size)*nchan_out, np.prod(self.filter_size)*nchan_in)
+        w_name = ('Wconv_{},{}@{}' + 'x{}'*(len(w_shape) - 3)).format(*w_shape)
+        self.W = self._addparam(w_shape, init, fan=w_fan, name=w_name)

-        param_name = 'Wconv_{},{}@{}' + 'x{}'*(len(w_shape) - 3)
-        self.weight, self.grad_weight = create_param_and_grad(w_shape, initW, fan=w_fan, name=param_name.format(*w_shape))
-        if self.with_bias:
-            self.bias, self.grad_bias = create_param_and_grad(nchan_out, initB, name='bconv_{}'.format(nchan_out))
+        if bias not in (None, False):
+            self.b = self._addparam(nchan_out, bias, decay=False, name='bconv_{}'.format(nchan_out))
+        else:
+            self.b = None


     def symb_forward(self, symb_input):
         """ creates dummy forward conv and uses its gradient as backwards pass """
         """ This code is mostly taken from https://github.com/Newmu/dcgan_code/blob/master/lib/ops.py """
         img = gpu_contiguous(symb_input)
-        kerns = gpu_contiguous(self.weight)
+        kerns = gpu_contiguous(self.W.param)

         alloc_shape = (img.shape[0], kerns.shape[1]) + tuple(i*d for i,d in zip(img.shape[2:],self.stride))
         desc = dnn.GpuDnnConvDesc(border_mode=self.border, subsample=self.stride, conv_mode=self.mode)(gpu_alloc_empty(*alloc_shape).shape, kerns.shape)
         out = gpu_alloc_empty(*alloc_shape)
         grad = dnn.GpuDnnConv3dGradI if symb_input.ndim == 5 else dnn.GpuDnnConvGradI
         conv_output = grad()(kerns, img, out, desc)

-        if self.with_bias:
+        if self.b is not None:
             d_shuffle = ('x', 0) + tuple('x') * (symb_input.ndim-2)
-            conv_output += self.bias.dimshuffle(*d_shuffle)
+            conv_output += self.b.param.dimshuffle(*d_shuffle)

         return conv_output
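
Two small points about the new constructor: passing `bias=None` (or `False`) now replaces the old `with_bias=False` flag, and the weight's name encodes the filter shape via the same format string as before. A quick check of what that string produces for a hypothetical 3x3 convolution from 16 to 32 channels:

    w_shape = (16, 32, 3, 3)  # (nchan_in, nchan_out) + filter_size
    w_name = ('Wconv_{},{}@{}' + 'x{}'*(len(w_shape) - 3)).format(*w_shape)
    assert w_name == 'Wconv_16,32@3x3'
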
Lines changed: 27 additions & 27 deletions
@@ -1,5 +1,5 @@
 import DeepFried2 as df
-from DeepFried2.utils import create_param, create_param_and_grad, aslist
+from DeepFried2.utils import aslist

 import numpy as _np

@@ -19,16 +19,16 @@ def __init__(self, n_features, eps=1e-5):

         self.shape = tuple(aslist(n_features))

-        self.weight, self.grad_weight = create_param_and_grad(n_features, df.init.const(1), name='W_BN_{}'.format(n_features))
-        self.bias, self.grad_bias = create_param_and_grad(n_features, df.init.const(0), name='b_BN_{}'.format(n_features))
+        self.W = self._addparam(n_features, df.init.const(1), name='W_BN_{}'.format(n_features))
+        self.b = self._addparam(n_features, df.init.const(0), name='b_BN_{}'.format(n_features), decay=False)

-        self.inference_weight = create_param(n_features, df.init.const(1), name='W_BN_{}_inf'.format(n_features))
-        self.inference_bias = create_param(n_features, df.init.const(0), name='b_BN_{}_inf'.format(n_features))
+        self.Winf = self._addparam(n_features, df.init.const(1), name='W_BN_{}_inf'.format(n_features), learn=False)
+        self.binf = self._addparam(n_features, df.init.const(0), name='b_BN_{}_inf'.format(n_features), learn=False)

         # These are buffers for collecting the minibatch statistics.
-        self.buffer_variance = create_param(n_features, df.init.const(1), name='BN_var_{}'.format(n_features))
-        self.buffer_mean = create_param(n_features, df.init.const(0), name='BN_mean_{}'.format(n_features))
-        self.buffer_counts = df.th.shared(_np.asarray(0, dtype=df.floatX), name='BN_count_{}'.format(n_features))
+        self.buf_var = df.th.shared(_np.full(n_features, 1, df.floatX), name='BN_var_{}'.format(n_features))
+        self.buf_mean = df.th.shared(_np.full(n_features, 0, df.floatX), name='BN_mean_{}'.format(n_features))
+        self.buf_count = df.th.shared(_np.asarray(0, dtype=df.floatX), name='BN_count_{}'.format(n_features))

         self.eps = eps or 1e-5

@@ -46,6 +46,9 @@ def symb_forward(self, symb_input):

         # And for the dimshuffle, similar story. Put 'x' on the axes we're normalizing.
         d_shuffle = ['x'] + list(range(len(self.shape))) + ['x']*(symb_input.ndim-len(self.shape)-1)
+        # Shorthand:
+        def dshuf(x):
+            return x.dimshuffle(*d_shuffle)

         # For example, for the usual case of images where dimensions are
         # (B,C,H,W), axis == [0, 2, 3] and d_shuffle == ['x', 0, 'x', 'x']
@@ -54,42 +57,39 @@
             self.batch_mean = df.T.mean(symb_input, axis=axis)
             self.batch_var = df.T.var(symb_input, axis=axis)

-            return (symb_input - self.batch_mean.dimshuffle(*d_shuffle)) / df.T.sqrt(self.batch_var + self.eps).dimshuffle(*d_shuffle) * self.weight.dimshuffle(*d_shuffle) + self.bias.dimshuffle(*d_shuffle)
+            symb_input = (symb_input - dshuf(self.batch_mean)) / dshuf(df.T.sqrt(self.batch_var + self.eps))
+
+            return symb_input * dshuf(self.W.param) + dshuf(self.b.param)
         else:
-            return symb_input * self.inference_weight.dimshuffle(*d_shuffle) + self.inference_bias.dimshuffle(*d_shuffle)
+            return symb_input * dshuf(self.Winf.param) + dshuf(self.binf.param)

     def get_stat_updates(self):
         assert (self.batch_mean is not None) and (self.batch_var is not None), "You need to do a forward pass first"

-        stat_updates = list()
-        stat_updates.append((self.buffer_mean,
-                             (self.buffer_mean * self.buffer_counts + self.batch_mean) / (self.buffer_counts + 1.0)))
-
-        stat_updates.append((self.buffer_variance,
-                             (self.buffer_variance * self.buffer_counts + self.batch_var) / (self.buffer_counts + 1.0)))
-
-        stat_updates.append((self.buffer_counts,
-                             self.buffer_counts + 1.0))
-
-        return stat_updates
+        # Update buffer statistics with current batch's statistics.
+        return [
+            (self.buf_mean, (self.buf_mean * self.buf_count + self.batch_mean) / (self.buf_count + 1.0)),
+            (self.buf_var, (self.buf_var * self.buf_count + self.batch_var) / (self.buf_count + 1.0)),
+            (self.buf_count, self.buf_count + 1.0),
+        ]

     def training(self):
        df.Module.training(self)
-        self.buffer_counts.set_value(0)
+        self.buf_count.set_value(0)
         self.batch_mean = None
         self.batch_var = None

     def evaluate(self):
         df.Module.evaluate(self)
-        self.inference_weight.set_value(self.weight.get_value() / _np.sqrt(self.buffer_variance.get_value() + self.eps))
-        self.inference_bias.set_value(self.bias.get_value() - self.inference_weight.get_value() * self.buffer_mean.get_value())
+        self.Winf.set_value(self.W.get_value() / _np.sqrt(self.buf_var.get_value() + self.eps))
+        self.binf.set_value(self.b.get_value() - self.Winf.get_value() * self.buf_mean.get_value())

     def __getstate__(self):
         regular = df.Module.__getstate__(self)
-        return [b.get_value() for b in (self.buffer_mean, self.buffer_variance, self.buffer_counts)] + regular
+        return [buf.get_value() for buf in (self.buf_mean, self.buf_var, self.buf_count)] + regular

     def __setstate__(self, state):
         istate = iter(state)
-        for b, s in zip((self.buffer_mean, self.buffer_variance, self.buffer_counts), istate):
-            b.set_value(s)
+        for buf, val in zip((self.buf_mean, self.buf_var, self.buf_count), istate):
+            buf.set_value(val)
         df.Module.__setstate__(self, istate)