|
| 1 | +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | +"""seq2seq model for fluid.""" |
| 15 | +from __future__ import absolute_import |
| 16 | +from __future__ import division |
| 17 | +from __future__ import print_function |
| 18 | + |
| 19 | +import numpy as np |
| 20 | +import argparse |
| 21 | +import time |
| 22 | +import distutils.util |
| 23 | + |
| 24 | +import paddle.v2 as paddle |
| 25 | +import paddle.fluid as fluid |
| 26 | +import paddle.fluid.core as core |
| 27 | +import paddle.fluid.framework as framework |
| 28 | +from paddle.fluid.executor import Executor |
| 29 | + |
def _str_to_bool(value):
    """Parse the textual value of a boolean command-line option.

    Accepts, case-insensitively, the spellings that
    ``distutils.util.strtobool`` accepts: y/yes/t/true/on/1 for true and
    n/no/f/false/off/0 for false. A local parser is used because
    ``distutils`` is deprecated and no longer shipped with recent Python
    releases.

    Args:
        value: the raw string taken from the command line.

    Returns:
        True or False.

    Raises:
        argparse.ArgumentTypeError: if ``value`` is none of the spellings
            above; argparse reports it as a usage error.
    """
    normalized = value.lower()
    if normalized in ('y', 'yes', 't', 'true', 'on', '1'):
        return True
    if normalized in ('n', 'no', 'f', 'false', 'off', '0'):
        return False
    raise argparse.ArgumentTypeError('invalid boolean value: %r' % value)


# Command-line options of the benchmark. The parsed namespace is stored in
# the module-level name ``args`` by the ``__main__`` guard and read by train().
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
    "--embedding_dim",
    type=int,
    default=512,
    help="The dimension of embedding table. (default: %(default)d)")
parser.add_argument(
    "--encoder_size",
    type=int,
    default=512,
    help="The size of encoder bi-rnn unit. (default: %(default)d)")
parser.add_argument(
    "--decoder_size",
    type=int,
    default=512,
    help="The size of decoder rnn unit. (default: %(default)d)")
parser.add_argument(
    "--batch_size",
    type=int,
    default=16,
    help="The sequence number of a mini-batch data. (default: %(default)d)")
parser.add_argument(
    "--dict_size",
    type=int,
    default=30000,
    help="The dictionary capacity. Dictionaries of source sequence and "
    "target dictionary have same capacity. (default: %(default)d)")
parser.add_argument(
    "--pass_num",
    type=int,
    default=2,
    help="The pass number to train. (default: %(default)d)")
parser.add_argument(
    "--learning_rate",
    type=float,
    default=0.0002,
    help="Learning rate used to train the model. (default: %(default)f)")
parser.add_argument(
    "--infer_only", action='store_true', help="If set, run forward only.")
parser.add_argument(
    "--beam_size",
    type=int,
    default=3,
    help="The width for beam searching. (default: %(default)d)")
parser.add_argument(
    "--use_gpu",
    type=_str_to_bool,
    default=True,
    help="Whether to use gpu. (default: %(default)d)")
parser.add_argument(
    "--max_length",
    type=int,
    default=250,
    help="The maximum length of sequence when doing generation. "
    "(default: %(default)d)")
| 85 | + |
| 86 | + |
def lstm_step(x_t, hidden_t_prev, cell_t_prev, size):
    """Assemble one LSTM step from basic fluid layers.

    Args:
        x_t: input of the current step.
        hidden_t_prev: hidden state coming from the previous step.
        cell_t_prev: cell state coming from the previous step.
        size: output size of each of the four fc projections.

    Returns:
        A ``(hidden_t, cell_t)`` tuple holding the new hidden and cell state.
    """

    def gate(activation):
        # Every call issues its own fc over [hidden_t_prev, x_t] and then
        # applies the requested activation to the result.
        projected = fluid.layers.fc(
            input=[hidden_t_prev, x_t], size=size, bias_attr=True)
        return activation(x=projected)

    # The gates are created in the order forget, input, output, candidate.
    forget_gate = gate(fluid.layers.sigmoid)
    input_gate = gate(fluid.layers.sigmoid)
    output_gate = gate(fluid.layers.sigmoid)
    cell_tilde = gate(fluid.layers.tanh)

    # cell_t = forget_gate * cell_t_prev + input_gate * cell_tilde
    retained = fluid.layers.elementwise_mul(x=forget_gate, y=cell_t_prev)
    admitted = fluid.layers.elementwise_mul(x=input_gate, y=cell_tilde)
    cell_t = fluid.layers.sums(input=[retained, admitted])

    # hidden_t = output_gate * tanh(cell_t)
    squashed_cell = fluid.layers.tanh(x=cell_t)
    hidden_t = fluid.layers.elementwise_mul(x=output_gate, y=squashed_cell)

    return hidden_t, cell_t
| 106 | + |
| 107 | + |
def seq_to_seq_net(embedding_dim, encoder_size, decoder_size, source_dict_dim,
                   target_dict_dim, is_generating, beam_size, max_length):
    """Construct the training graph of a seq2seq network with attention.

    The graph consists of a source embedding, a bi-directional LSTM encoder
    and an LSTM decoder (built from ``lstm_step`` inside a ``DynamicRNN``)
    that attends over the encoder outputs. The loss is the mean cross
    entropy between the decoder output and the label sequence.

    Args:
        embedding_dim: width of the source and target embedding tables.
        encoder_size: gate size handed to the bi-directional encoder.
        decoder_size: size of the decoder state and of the attention
            projections.
        source_dict_dim: number of rows of the source embedding table.
        target_dict_dim: number of rows of the target embedding table and
            size of the decoder's softmax output.
        is_generating: must be false; only the training graph exists.
        beam_size: accepted for interface compatibility; not used.
        max_length: accepted for interface compatibility; not used.

    Returns:
        A ``(avg_cost, feeding_list)`` tuple: the mean loss variable and the
        names of the three data layers, in feeding order.

    Raises:
        NotImplementedError: if ``is_generating`` is true.
    """
    if is_generating:
        # Only the training branch is implemented. Without this guard the
        # function would add the encoder ops and then fail at the return
        # statement because avg_cost / feeding_list were never assigned.
        raise NotImplementedError(
            "seq_to_seq_net: the generation graph (is_generating=True) is "
            "not implemented")

    def bi_lstm_encoder(input_seq, gate_size):
        # Linear transformation part for input gate, output gate, forget gate
        # and cell activation vectors need be done outside of dynamic_lstm.
        # So the output size is 4 times of gate_size.
        input_forward_proj = fluid.layers.fc(input=input_seq,
                                             size=gate_size * 4,
                                             act=None,
                                             bias_attr=False)
        forward, _ = fluid.layers.dynamic_lstm(
            input=input_forward_proj, size=gate_size * 4, use_peepholes=False)
        input_reversed_proj = fluid.layers.fc(input=input_seq,
                                              size=gate_size * 4,
                                              act=None,
                                              bias_attr=False)
        # Named `backward` so the builtin `reversed` is not shadowed.
        backward, _ = fluid.layers.dynamic_lstm(
            input=input_reversed_proj,
            size=gate_size * 4,
            is_reverse=True,
            use_peepholes=False)
        return forward, backward

    src_word_idx = fluid.layers.data(
        name='source_sequence', shape=[1], dtype='int64', lod_level=1)

    src_embedding = fluid.layers.embedding(
        input=src_word_idx,
        size=[source_dict_dim, embedding_dim],
        dtype='float32')

    src_forward, src_reversed = bi_lstm_encoder(
        input_seq=src_embedding, gate_size=encoder_size)

    encoded_vector = fluid.layers.concat(
        input=[src_forward, src_reversed], axis=1)

    encoded_proj = fluid.layers.fc(input=encoded_vector,
                                   size=decoder_size,
                                   bias_attr=False)

    # The decoder's initial hidden state is derived from the first step of
    # the reverse-direction encoder output.
    backward_first = fluid.layers.sequence_pool(
        input=src_reversed, pool_type='first')

    decoder_boot = fluid.layers.fc(input=backward_first,
                                   size=decoder_size,
                                   bias_attr=False,
                                   act='tanh')

    def lstm_decoder_with_attention(target_embedding, encoder_vec, encoder_proj,
                                    decoder_boot, decoder_size):
        def simple_attention(encoder_vec, encoder_proj, decoder_state):
            decoder_state_proj = fluid.layers.fc(input=decoder_state,
                                                 size=decoder_size,
                                                 bias_attr=False)
            # Expand the projected decoder state against encoder_proj so the
            # two can be concatenated along axis 1.
            decoder_state_expand = fluid.layers.sequence_expand(
                x=decoder_state_proj, y=encoder_proj)
            concated = fluid.layers.concat(
                input=[encoder_proj, decoder_state_expand], axis=1)
            # One score per row, normalised by sequence_softmax.
            attention_weights = fluid.layers.fc(input=concated,
                                                size=1,
                                                act='tanh',
                                                bias_attr=False)
            attention_weights = fluid.layers.sequence_softmax(
                input=attention_weights)
            weights_flat = fluid.layers.reshape(
                x=attention_weights, shape=[-1])
            # NOTE(review): presumably scales every encoder row by its
            # weight (axis=0) before the per-sequence sum -- confirm against
            # the elementwise_mul documentation.
            scaled = fluid.layers.elementwise_mul(
                x=encoder_vec, y=weights_flat, axis=0)
            context = fluid.layers.sequence_pool(input=scaled, pool_type='sum')
            return context

        rnn = fluid.layers.DynamicRNN()

        # Zero-filled initial cell state, batch-sized like decoder_boot.
        cell_init = fluid.layers.fill_constant_batch_size_like(
            input=decoder_boot,
            value=0.0,
            shape=[-1, decoder_size],
            dtype='float32')
        cell_init.stop_gradient = False

        with rnn.block():
            current_word = rnn.step_input(target_embedding)
            encoder_vec = rnn.static_input(encoder_vec)
            encoder_proj = rnn.static_input(encoder_proj)
            hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
            cell_mem = rnn.memory(init=cell_init)
            context = simple_attention(encoder_vec, encoder_proj, hidden_mem)
            decoder_inputs = fluid.layers.concat(
                input=[context, current_word], axis=1)
            h, c = lstm_step(decoder_inputs, hidden_mem, cell_mem, decoder_size)
            rnn.update_memory(hidden_mem, h)
            rnn.update_memory(cell_mem, c)
            out = fluid.layers.fc(input=h,
                                  size=target_dict_dim,
                                  bias_attr=True,
                                  act='softmax')
            rnn.output(out)
        return rnn()

    trg_word_idx = fluid.layers.data(
        name='target_sequence', shape=[1], dtype='int64', lod_level=1)

    trg_embedding = fluid.layers.embedding(
        input=trg_word_idx,
        size=[target_dict_dim, embedding_dim],
        dtype='float32')

    prediction = lstm_decoder_with_attention(trg_embedding, encoded_vector,
                                             encoded_proj, decoder_boot,
                                             decoder_size)
    label = fluid.layers.data(
        name='label_sequence', shape=[1], dtype='int64', lod_level=1)
    cost = fluid.layers.cross_entropy(input=prediction, label=label)
    avg_cost = fluid.layers.mean(x=cost)

    feeding_list = ["source_sequence", "target_sequence", "label_sequence"]

    return avg_cost, feeding_list
| 230 | + |
| 231 | + |
def to_lodtensor(data, place):
    """Pack a batch of variable-length sequences into a single LoDTensor.

    Args:
        data: iterable of sequences; each must support ``len()`` and be
            accepted by ``np.concatenate``. One-shot iterables (for example
            a Python 3 ``map`` object) are supported.
        place: device place forwarded to ``LoDTensor.set``.

    Returns:
        A ``(lod_t, total)`` tuple: the filled LoDTensor and the total number
        of elements over all sequences.
    """
    # The batch is walked twice (lengths, then concatenation), so it is
    # materialised first; a bare iterator would be exhausted by the first
    # walk.
    sequences = list(data)

    # Cumulative start offsets: [0, len(s0), len(s0) + len(s1), ...].
    offsets = [0]
    for seq in sequences:
        offsets.append(offsets[-1] + len(seq))

    flattened = np.concatenate(sequences, axis=0).astype("int64")
    # One element per row, i.e. a column of shape [total, 1].
    flattened = flattened.reshape([len(flattened), 1])

    lod_t = core.LoDTensor()
    lod_t.set(flattened, place)
    lod_t.set_lod([offsets])
    return lod_t, offsets[-1]
| 245 | + |
| 246 | + |
def lodtensor_to_ndarray(lod_tensor):
    """Copy the float contents of a tensor into a new float32 ndarray.

    Args:
        lod_tensor: object exposing ``get_dims()`` (the shape) and
            ``get_float_element(i)`` (the i-th element by flat index).

    Returns:
        A float32 ``numpy.ndarray`` of shape ``get_dims()``, filled in
        row-major order from ``get_float_element``.
    """
    dims = lod_tensor.get_dims()
    ndarray = np.zeros(shape=dims, dtype='float32')
    # `range` and `ndarray.size` replace `xrange` / `np.product`, neither of
    # which exists on current Python / NumPy. `.flat` indexes the array
    # itself in row-major order, so the writes land in `ndarray`.
    flat = ndarray.flat
    for i in range(ndarray.size):
        flat[i] = lod_tensor.get_float_element(i)
    return ndarray
| 253 | + |
| 254 | + |
def train():
    """Train the seq2seq model on WMT14 and report the loss after each pass.

    Reads its configuration from the module-level ``args`` namespace, which
    the ``__main__`` guard sets before calling this function. For every pass
    it prints the training loss per batch and then the mean test loss,
    the word throughput and the duration of the pass.
    """
    avg_cost, feeding_list = seq_to_seq_net(
        args.embedding_dim,
        args.encoder_size,
        args.decoder_size,
        args.dict_size,
        args.dict_size,
        False,
        beam_size=args.beam_size,
        max_length=args.max_length)

    # Cloned before minimize() is called, i.e. before the optimizer touches
    # the main program; this copy is the one used for validation.
    inference_program = fluid.default_main_program().clone()

    optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
    optimizer.minimize(avg_cost)

    fluid.memory_optimize(fluid.default_main_program())

    train_batch_generator = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.train(args.dict_size), buf_size=1000),
        batch_size=args.batch_size)

    test_batch_generator = paddle.batch(
        paddle.reader.shuffle(
            paddle.dataset.wmt14.test(args.dict_size), buf_size=1000),
        batch_size=args.batch_size)

    place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace()
    exe = Executor(place)
    exe.run(framework.default_startup_program())

    def build_feed(batch):
        """Return (feed dict, source + target element count) for a batch.

        Each sample is indexed as (source, target, label). Lists are built
        explicitly so the sequences can be traversed more than once.
        """
        src_seq, src_words = to_lodtensor([sample[0] for sample in batch],
                                          place)
        trg_seq, trg_words = to_lodtensor([sample[1] for sample in batch],
                                          place)
        lbl_seq, _ = to_lodtensor([sample[2] for sample in batch], place)
        feed = {
            feeding_list[0]: src_seq,
            feeding_list[1]: trg_seq,
            feeding_list[2]: lbl_seq
        }
        return feed, src_words + trg_words

    def do_validation():
        """Return the mean loss over the test batches (NaN if there are none)."""
        total_loss = 0.0
        count = 0
        for data in test_batch_generator():
            feed, _ = build_feed(data)
            fetch_outs = exe.run(inference_program,
                                 feed=feed,
                                 fetch_list=[avg_cost],
                                 return_numpy=False)

            total_loss += lodtensor_to_ndarray(fetch_outs[0])[0]
            count += 1

        if count == 0:
            # An empty test reader must not abort training with a
            # ZeroDivisionError after a full pass has been computed.
            return float('nan')
        return total_loss / count

    for pass_id in range(args.pass_num):
        pass_start_time = time.time()
        words_seen = 0
        for batch_id, data in enumerate(train_batch_generator()):
            feed, word_num = build_feed(data)
            # Only source and target elements are counted, not the labels.
            words_seen += word_num

            fetch_outs = exe.run(framework.default_main_program(),
                                 feed=feed,
                                 fetch_list=[avg_cost])

            avg_cost_val = np.array(fetch_outs[0])
            print('pass_id=%d, batch_id=%d, train_loss: %f' %
                  (pass_id, batch_id, avg_cost_val))

        # The end time is taken before validation, so the reported
        # throughput and duration cover the training batches only.
        pass_end_time = time.time()
        test_loss = do_validation()
        time_consumed = pass_end_time - pass_start_time
        words_per_sec = words_seen / time_consumed
        print("pass_id=%d, test_loss: %f, words/s: %f, sec/pass: %f" %
              (pass_id, test_loss, words_per_sec, time_consumed))
| 338 | + |
| 339 | + |
def infer():
    """Entry point for ``--infer_only``; inference is not implemented.

    Raises:
        NotImplementedError: always. Failing loudly keeps a run started with
            ``--infer_only`` from exiting successfully without having done
            any work.
    """
    raise NotImplementedError(
        "infer: inference (--infer_only) is not implemented")
| 342 | + |
| 343 | + |
if __name__ == '__main__':
    # `args` has to stay a module-level name: train() reads it as a global.
    args = parser.parse_args()
    # Pick the inference or the training entry point, then run it.
    entry_point = infer if args.infer_only else train
    entry_point()
0 commit comments