from .Module import Module

import numpy as _np
import theano as _th
import theano.tensor as _T
6+
7+
class BatchNormalization(Module):
    """Batch normalization layer.

    In training mode, inputs are normalized with the current mini-batch's
    mean/variance, then scaled and shifted by the learnable `weight` (gamma)
    and `bias` (beta). Per-batch statistics are accumulated into running
    buffers via `get_stat_updates`. Calling `evaluate` folds the running
    statistics and the learned parameters into a single fused multiply-add
    (`inference_weight`, `inference_bias`) used for inference.
    """

    def __init__(self, n_features, eps=None):
        """
        Args:
            n_features: number of features (channels) being normalized.
            eps: small constant added to the variance for numerical
                stability; defaults to 1e-5 when None.
        """
        Module.__init__(self)

        floatX = _th.config.floatX

        # Learnable scale (gamma) and shift (beta), plus gradient buffers.
        self.weight = _th.shared(_np.ones(shape=(n_features,), dtype=floatX))
        self.bias = _th.shared(_np.zeros(shape=(n_features,), dtype=floatX))
        self.grad_weight = _th.shared(_np.zeros(shape=(n_features,), dtype=floatX))
        self.grad_bias = _th.shared(_np.zeros(shape=(n_features,), dtype=floatX))

        # Fused affine parameters used at inference time; recomputed from the
        # running statistics whenever `evaluate` is called.
        self.inference_weight = _th.shared(_np.ones(shape=(n_features,), dtype=floatX))
        self.inference_bias = _th.shared(_np.zeros(shape=(n_features,), dtype=floatX))

        # Running statistics, accumulated as a plain (equal-weight) average
        # over all batches seen since the last `training()` call — not an
        # exponential moving average.
        self.buffer_variance = _th.shared(_np.ones(shape=(n_features,), dtype=floatX))
        self.buffer_mean = _th.shared(_np.zeros(shape=(n_features,), dtype=floatX))
        self.buffer_counts = _th.shared(_np.asarray(0., dtype=floatX))

        # BUGFIX: the original `eps or 1e-5` silently replaced any falsy
        # explicit value (e.g. eps=0) with the default; only substitute the
        # default when eps is actually None.
        self.eps = 1e-5 if eps is None else eps

        # Symbolic statistics of the most recent training-mode forward pass;
        # consumed by `get_stat_updates`.
        self.batch_mean = None
        self.batch_var = None

    def symb_forward(self, symb_input):
        """Build the symbolic output for `symb_input`.

        Handles 2D input (batch, features) and 4D input
        (batch, channels, height, width); statistics are reduced over every
        axis except the feature/channel axis.
        """
        d_shuffle = ('x', 0)
        axis = (0,)

        if symb_input.ndim == 4:
            # Convolutional input: also reduce over the spatial axes and
            # broadcast the per-channel parameters across them.
            d_shuffle += ('x', 'x')
            axis += (2, 3)

        if self.training_mode:
            # Remember the batch statistics so get_stat_updates can fold
            # them into the running buffers.
            self.batch_mean = _T.mean(symb_input, axis=axis)
            self.batch_var = _T.var(symb_input, axis=axis)

            normalized = (symb_input - self.batch_mean.dimshuffle(*d_shuffle)) \
                / _T.sqrt(self.batch_var + self.eps).dimshuffle(*d_shuffle)
            return normalized * self.weight.dimshuffle(*d_shuffle) \
                + self.bias.dimshuffle(*d_shuffle)
        else:
            # Inference: single fused multiply-add with the parameters
            # precomputed in `evaluate`.
            return symb_input * self.inference_weight.dimshuffle(*d_shuffle) \
                + self.inference_bias.dimshuffle(*d_shuffle)

    def get_stat_updates(self):
        """Return Theano update pairs accumulating the running statistics.

        Each buffer is updated as an equal-weight running average:
        ``buf <- (buf * n + batch_stat) / (n + 1)``.

        Raises:
            AssertionError: if no training-mode forward pass has been done
                since the last `training()` call.
        """
        assert (self.batch_mean is not None) and (self.batch_var is not None), \
            "You need to do a forward pass first"

        counts = self.buffer_counts
        return [
            (self.buffer_mean,
             (self.buffer_mean * counts + self.batch_mean) / (counts + 1.0)),
            (self.buffer_variance,
             (self.buffer_variance * counts + self.batch_var) / (counts + 1.0)),
            (self.buffer_counts, counts + 1.0),
        ]

    def training(self):
        """Switch to training mode and restart statistics accumulation."""
        Module.training(self)
        # With the count at zero, the next get_stat_updates overwrites the
        # buffers with the first batch's statistics, so stale values from a
        # previous run are discarded.
        self.buffer_counts.set_value(0)
        self.batch_mean = None
        self.batch_var = None

    def evaluate(self):
        """Switch to evaluation mode.

        Folds the running statistics into a fused affine transform:
        ``y = x * w / sqrt(var + eps) + (b - w * mean / sqrt(var + eps))``.
        """
        Module.evaluate(self)
        inv_std_weight = self.weight.get_value() \
            / _np.sqrt(self.buffer_variance.get_value() + self.eps)
        self.inference_weight.set_value(inv_std_weight)
        self.inference_bias.set_value(
            self.bias.get_value() - inv_std_weight * self.buffer_mean.get_value())