import cPickle as pkl
import time

import numpy
import theano
from theano import config
import theano.tensor as T
from theano.tensor.nnet import categorical_crossentropy

from fuel.datasets import TextFile
from fuel.streams import DataStream
from fuel.schemes import ConstantScheme
from fuel.transformers import Batch, Padding


# These files can be downloaded from
# http://www-etud.iro.umontreal.ca/~brakelp/train.txt.gz
# http://www-etud.iro.umontreal.ca/~brakelp/dictionary.pkl
# don't forget to change the paths and gunzip train.txt.gz
TRAIN_FILE = '/u/brakelp/temp/traindata.txt'
VAL_FILE = '/u/brakelp/temp/valdata.txt'
DICT_FILE = '/u/brakelp/temp/dictionary.pkl'

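# Sum of the per-character cross-entropy over time and batch, with a
# binary mask that removes the contribution of the zero-padded positions
# in shorter sequences.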
def sequence_categorical_crossentropy(prediction, targets, mask):
    prediction_flat = prediction.reshape(((prediction.shape[0] *
                                           prediction.shape[1]),
                                          prediction.shape[2]), ndim=2)
    targets_flat = targets.flatten()
    mask_flat = mask.flatten()
    ce = categorical_crossentropy(prediction_flat, targets_flat)
    return T.sum(ce * mask_flat)

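# Draw an initial weight matrix from a zero-mean Gaussian with a small
# standard deviation.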
def gauss_weight(rng, ndim_in, ndim_out=None, sd=.005):
    if ndim_out is None:
        ndim_out = ndim_in
    W = rng.randn(ndim_in, ndim_out) * sd
    return numpy.asarray(W, dtype=config.floatX)


class LogisticRegression(object):
    """Multi-class Logistic Regression Class

    The logistic regression is fully described by a weight matrix :math:`W`
    and bias vector :math:`b`. Classification is done by projecting data
    points onto a set of hyperplanes, the distance to which is used to
    determine a class membership probability.
    """

    def __init__(self, input, n_in, n_out):
        """Initialize the parameters of the logistic regression.

        :type input: theano.tensor.TensorType
        :param input: symbolic variable that describes the input of the
                      architecture (one minibatch)

        :type n_in: int
        :param n_in: number of input units, the dimension of the space in
                     which the datapoints lie

        :type n_out: int
        :param n_out: number of output units, the dimension of the space in
                      which the labels lie

        """

        # initialize the weights W as a zero matrix of shape (n_in, n_out)
        self.W = theano.shared(value=numpy.zeros((n_in, n_out),
                                                 dtype=theano.config.floatX),
                               name='W', borrow=True)
        # initialize the biases b as a vector of n_out zeros
        self.b = theano.shared(value=numpy.zeros((n_out,),
                                                 dtype=theano.config.floatX),
                               name='b', borrow=True)

        # compute the class-membership probabilities in symbolic form: a
        # softmax over the last axis (the class dimension) of the
        # (time, batch, class) energy tensor, with the maximum subtracted
        # first for numerical stability
        energy = T.dot(input, self.W) + self.b
        energy_exp = T.exp(energy - T.max(energy, 2)[:, :, None])
        pmf = energy_exp / energy_exp.sum(2)[:, :, None]
        self.p_y_given_x = pmf

        # compute the prediction as the class whose probability is
        # maximal, along the last (class) axis of p_y_given_x
        self.y_pred = T.argmax(self.p_y_given_x, axis=-1)

        # parameters of the model
        self.params = [self.W, self.b]


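# Select rows of w by integer index.  For one-hot coded inputs this is
# equivalent to multiplying the one-hot matrix with w, but much cheaper.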
def index_dot(indices, w):
    return w[indices.flatten()]


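# An LSTM layer over a (time, batch) matrix of integer character indices.
# `mask` has the same shape and is 1 for real symbols, 0 for padding.
# theano.scan applies _step once per time step, threading the hidden
# state h and the cell state c through the recurrence.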
class LstmLayer:

    def __init__(self, rng, input, mask, n_in, n_h):

        # input-to-gate weights, one matrix per gate and one for the
        # candidate cell state
        self.W_i = theano.shared(gauss_weight(rng, n_in, n_h), 'W_i',
                                 borrow=True)
        self.W_f = theano.shared(gauss_weight(rng, n_in, n_h), 'W_f',
                                 borrow=True)
        self.W_c = theano.shared(gauss_weight(rng, n_in, n_h), 'W_c',
                                 borrow=True)
        self.W_o = theano.shared(gauss_weight(rng, n_in, n_h), 'W_o',
                                 borrow=True)

        # hidden-to-gate weights
        self.U_i = theano.shared(gauss_weight(rng, n_h), 'U_i', borrow=True)
        self.U_f = theano.shared(gauss_weight(rng, n_h), 'U_f', borrow=True)
        self.U_c = theano.shared(gauss_weight(rng, n_h), 'U_c', borrow=True)
        self.U_o = theano.shared(gauss_weight(rng, n_h), 'U_o', borrow=True)

        # gate biases
        self.b_i = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_i', borrow=True)
        self.b_f = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_f', borrow=True)
        self.b_c = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_c', borrow=True)
        self.b_o = theano.shared(numpy.zeros((n_h,), dtype=config.floatX),
                                 'b_o', borrow=True)

        self.params = [self.W_i, self.W_f, self.W_c, self.W_o,
                       self.U_i, self.U_f, self.U_c, self.U_o,
                       self.b_i, self.b_f, self.b_c, self.b_o]

        # initial hidden and cell states: one zero vector per batch element
        outputs_info = [T.zeros((input.shape[1], n_h)),
                        T.zeros((input.shape[1], n_h))]

        rval, updates = theano.scan(self._step,
                                    sequences=[mask, input],
                                    outputs_info=outputs_info)

        # self.output is the sequence of hidden states, in the format
        # (seq_len, batch_size, n_h)
        self.output = rval[0]

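    # One step of the LSTM recurrence.  Because x_ holds integer indices,
    # the index_dot lookups play the role of W * one_hot(x_).  Where the
    # mask m_ is 0 (padding), the previous h_ and c_ are carried over
    # unchanged, so padding never affects the state.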
    def _step(self, m_, x_, h_, c_):

        i_preact = (index_dot(x_, self.W_i) +
                    T.dot(h_, self.U_i) + self.b_i)
        i = T.nnet.sigmoid(i_preact)

        f_preact = (index_dot(x_, self.W_f) +
                    T.dot(h_, self.U_f) + self.b_f)
        f = T.nnet.sigmoid(f_preact)

        o_preact = (index_dot(x_, self.W_o) +
                    T.dot(h_, self.U_o) + self.b_o)
        o = T.nnet.sigmoid(o_preact)

        c_preact = (index_dot(x_, self.W_c) +
                    T.dot(h_, self.U_c) + self.b_c)
        c = T.tanh(c_preact)

        c = f * c_ + i * c
        c = m_[:, None] * c + (1. - m_)[:, None] * c_

        h = o * T.tanh(c)
        h = m_[:, None] * h + (1. - m_)[:, None] * h_

        return h, c


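# Train a character-level language model: at each time step the network
# is trained to predict the next character of the input text.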
def train_model(batch_size=100, n_h=50, n_epochs=40):

    # Load the datasets with Fuel; add an unknown-character token
    dictionary = pkl.load(open(DICT_FILE, 'rb'))
    dictionary['~'] = len(dictionary)
    # the vocabulary size determines the input and output layer sizes below
    vocab_size = len(dictionary)
    reverse_mapping = dict((j, i) for i, j in dictionary.items())

    print("Loading the data")
    train = TextFile(files=[TRAIN_FILE],
                     dictionary=dictionary,
                     unk_token='~',
                     level='character',
                     preprocess=str.lower,
                     bos_token=None,
                     eos_token=None)

    train_stream = DataStream.default_stream(train)

    # organize the data in batches and pad shorter sequences with zeros
    train_stream = Batch(train_stream,
                         iteration_scheme=ConstantScheme(batch_size))
    train_stream = Padding(train_stream)

    # the same for the validation text
    val = TextFile(files=[VAL_FILE],
                   dictionary=dictionary,
                   unk_token='~',
                   level='character',
                   preprocess=str.lower,
                   bos_token=None,
                   eos_token=None)

    val_stream = DataStream.default_stream(val)

    # organize the data in batches and pad shorter sequences with zeros
    val_stream = Batch(val_stream,
                       iteration_scheme=ConstantScheme(batch_size))
    val_stream = Padding(val_stream)
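
    # Padding adds a mask source to the stream, so each batch arrives as a
    # pair of (batch, time) arrays: the padded character indices and a 0/1
    # mask marking the real symbols.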

    print('Building model')

    # Set the random number generator's seed for consistency
    rng = numpy.random.RandomState(12345)

    # x holds a (time, batch) matrix of character indices; mask is the
    # corresponding 0/1 padding mask
    x = T.lmatrix('x')
    mask = T.matrix('mask')

    # Construct the LSTM layer
    recurrent_layer = LstmLayer(rng=rng, input=x, mask=mask,
                                n_in=vocab_size, n_h=n_h)

    # The softmax layer maps each hidden state to a distribution over the
    # vocabulary
    logreg_layer = LogisticRegression(input=recurrent_layer.output[:-1],
                                      n_in=n_h, n_out=vocab_size)

    # Next-character objective: the prediction made at time t (from
    # output[:-1]) is scored against the character at time t + 1 (x[1:])
    cost = sequence_categorical_crossentropy(logreg_layer.p_y_given_x,
                                             x[1:],
                                             mask[1:]) / batch_size

    # create a list of all model parameters to be fit by gradient descent
    params = logreg_layer.params + recurrent_layer.params

    # create a list of gradients for all model parameters
    grads = T.grad(cost, params)

    # update_model is a function that updates the model parameters by SGD.
    # Since this model has many parameters, it would be tedious to
    # manually create an update rule for each model parameter.  We thus
    # create the updates list by automatically looping over all
    # (params[i], grads[i]) pairs.
    learning_rate = 0.1
    updates = [
        (param_i, param_i - learning_rate * grad_i)
        for param_i, grad_i in zip(params, grads)
    ]

    update_model = theano.function([x, mask], cost, updates=updates)

    evaluate_model = theano.function([x, mask], cost)

    # Define and compile a function for generating a sequence step by step.
    # The layer's _step is reused with a mask of ones, so the state is
    # always updated; x_t is a single character index and h_p, c_p are the
    # previous hidden and cell state vectors.
    x_t = T.iscalar()
    h_p = T.vector()
    c_p = T.vector()
    h_t, c_t = recurrent_layer._step(T.ones(1), x_t, h_p, c_p)
    energy = T.dot(h_t, logreg_layer.W) + logreg_layer.b

    # normalized distribution over the next character
    energy_exp = T.exp(energy - T.max(energy, 1)[:, None])
    output = energy_exp / energy_exp.sum(1)[:, None]
    single_step = theano.function([x_t, h_p, c_p], [output, h_t, c_t])
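
    # An illustrative (hypothetical) call: given a character index and zero
    # initial states, single_step returns the distribution over the next
    # character together with the updated states, e.g.
    #   p, h, c = single_step(dictionary['a'],
    #                         numpy.zeros(n_h, dtype=config.floatX),
    #                         numpy.zeros(n_h, dtype=config.floatX))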

    start_time = time.clock()

    iteration = 0

    for epoch in range(n_epochs):
        print 'epoch:', epoch

        for x_, mask_ in train_stream.get_epoch_iterator():
            iteration += 1

            # Fuel delivers (batch, time) arrays while the model expects
            # (time, batch), hence the transposes
            cross_entropy = update_model(x_.T, mask_.T)

            # Generate some text after every 40 minibatches
            if iteration % 40 == 0:
                try:
                    # start from a uniform distribution and zero states
                    prediction = numpy.ones(vocab_size,
                                            dtype=config.floatX) / vocab_size
                    h_p = numpy.zeros((n_h,), dtype=config.floatX)
                    c_p = numpy.zeros((n_h,), dtype=config.floatX)
                    initial = 'the meaning of life is '
                    sentence = initial
                    # condition the state on the seed text
                    for char in initial:
                        x_t = dictionary[char]
                        prediction, h_p, c_p = single_step(x_t,
                                                           h_p.flatten(),
                                                           c_p.flatten())
                    # sample 450 characters, feeding each one back in
                    sample = numpy.random.multinomial(1,
                                                      prediction.flatten())
                    for i in range(450):
                        x_t = numpy.argmax(sample)
                        prediction, h_p, c_p = single_step(x_t,
                                                           h_p.flatten(),
                                                           c_p.flatten())
                        sentence += reverse_mapping[x_t]
                        sample = numpy.random.multinomial(
                            1, prediction.flatten())
                    print 'LSTM: "' + sentence + '"'
                except ValueError:
                    # multinomial can raise ValueError when the float32
                    # probabilities do not sum exactly to one
                    print 'Something went wrong during sentence generation.'

            # Evaluate on the validation set after every 40 minibatches
            if iteration % 40 == 0:
                print 'epoch:', epoch, ' minibatch:', iteration
                val_scores = []
                for x_val, mask_val in val_stream.get_epoch_iterator():
                    val_scores.append(evaluate_model(x_val.T, mask_val.T))
                print 'Average validation CE per sentence:', \
                    numpy.mean(val_scores)

    end_time = time.clock()
    print('Optimization complete.')
    print('The code ran for %.2fm' % ((end_time - start_time) / 60.))


if __name__ == '__main__':
    train_model()