
Commit 36fcc95

Nmt decoder train (#6367)
* init decoder_trainer (condensed sketch below)
* can run
* fix lod
* add ShareLoD to cross_entropy_grad_op
* add avg_cost to fetch list
* modify learning rate
* can run
* optimize code
* add early exit
* fix print
* revert test_understand_sentiment_conv.py
* add act to fc
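The centerpiece of the commit is the DynamicRNN-based training decoder added to test_machine_translation.py. As an orientation aid, the pattern condensed from that diff into a standalone helper looks roughly like the sketch below; the function boundary and argument names are editorial, not part of the commit, and it assumes a paddle.v2.fluid build of this era.

import paddle.v2.fluid as fluid

def decoder_train(encoder_out, trg_embedding, decoder_size, target_dict_dim):
    # Unroll the decoder over the target sequence with DynamicRNN.
    rnn = fluid.layers.DynamicRNN()
    with rnn.block():
        current_word = rnn.step_input(trg_embedding)   # one target word per step
        mem = rnn.memory(init=encoder_out)             # decoder state, seeded by the encoder output
        hidden = fluid.layers.fc(input=[current_word, mem],
                                 size=decoder_size,
                                 act='tanh')
        out = fluid.layers.fc(input=hidden, size=target_dict_dim, act='softmax')
        rnn.update_memory(mem, hidden)                 # carry the new state to the next step
        rnn.output(out)                                # per-step prediction
    return rnn()                                       # sequence of predictions, later fed to cross_entropy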
1 parent 7d85b6d commit 36fcc95

5 files changed: +80, -58 lines

paddle/framework/op_desc.cc

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
     auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
     auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
     if (in_var->GetType() != VarDesc::LOD_TENSOR) {
-      VLOG(3) << "input " << in << "is not LodTensor";
+      VLOG(3) << "input " << in << " is not LodTensor";
       return;
     }
     PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR,

paddle/operators/concat_op.cc

Lines changed: 8 additions & 4 deletions
@@ -41,14 +41,18 @@ class ConcatOp : public framework::OperatorWithKernel {
       for (size_t j = 0; j < in_zero_dims_size; j++) {
         if (j == axis) {
           out_dims[axis] += ins[i][j];
-          continue;
+        } else {
+          PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
+                            "Input tensors should have the same "
+                            "elements except the specify axis.");
         }
-        PADDLE_ENFORCE_EQ(out_dims[j], ins[i][j],
-                          "Input tensors should have the same "
-                          "elements except the specify axis.");
       }
     }
+    if (out_dims[axis] < 0) {
+      out_dims[axis] = -1;
+    }
     ctx->SetOutputDim("Out", out_dims);
+    ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
 
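The reworked branch keeps the old per-dimension check, clamps an unknown (negative) concat dimension back to -1, and propagates LoD from the input to the output. A rough Python rendering of the revised shape-inference rule, for illustration only (the authoritative logic is the C++ above):

def concat_infer_shape(input_dims, axis):
    out_dims = list(input_dims[0])
    for dims in input_dims[1:]:
        for j, d in enumerate(dims):
            if j == axis:
                out_dims[axis] += d
            else:
                # non-concat dimensions must agree across inputs
                assert out_dims[j] == d, (
                    "Input tensors should have the same "
                    "elements except the specify axis.")
    if out_dims[axis] < 0:
        # any unknown (-1) input dim makes the concatenated dim unknown too
        out_dims[axis] = -1
    return out_dims

# e.g. concat_infer_shape([[2, 3], [4, 3]], axis=0)   -> [6, 3]
#      concat_infer_shape([[-1, 3], [-1, 3]], axis=0) -> [-1, 3]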

paddle/operators/cross_entropy_op.cc

Lines changed: 1 addition & 0 deletions
@@ -95,6 +95,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
                       "Input(Label) should be 1.");
     }
     ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
+    ctx->ShareLoD("X", framework::GradVarName("X"));
   }
 
  protected:
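ShareLoD here declares that the gradient of X has the same sequence layout as X itself. A small illustration of why that is the natural choice, using plain numpy to stand in for a LoDTensor (this is not Paddle code, just the offset idea made explicit):

import numpy as np

# Two variable-length sequences stored the LoDTensor way: flat data plus offsets.
sequences = [[1, 2, 3], [4, 5]]
flat = np.array([w for seq in sequences for w in seq])   # shape [5]
lod = [0, 3, 5]                                          # seq0 = flat[0:3], seq1 = flat[3:5]

# Cross-entropy's gradient w.r.t. X is computed element by element over the same
# flat layout, so dX reuses the offsets unchanged -- which is what
# ctx->ShareLoD("X", framework::GradVarName("X")) records at compile time.
grad_flat = np.zeros_like(flat, dtype=np.float32)
grad_lod = list(lod)                                     # shared, not recomputed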

python/paddle/v2/fluid/layers.py

Lines changed: 2 additions & 1 deletion
@@ -430,7 +430,8 @@ def infer_and_check_dtype(op_proto, **kwargs):
             dtype = each.dtype
         elif dtype != each.dtype:
             raise ValueError(
-                "operator {0} must input same dtype".format(op_type))
+                "operator {0} must input same dtype. {1} vs {2}".format(
+                    op_type, dtype, each.dtype))
 
     return dtype
 
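For reference, the check that produces this error can be sketched as a standalone function; this is simplified relative to the real infer_and_check_dtype, which walks the op proto and keyword arguments:

from collections import namedtuple

Var = namedtuple('Var', ['dtype'])  # stand-in for a fluid Variable

def check_same_dtype(op_type, inputs):
    dtype = None
    for each in inputs:
        if dtype is None:
            dtype = each.dtype
        elif dtype != each.dtype:
            # Reporting both dtypes (the new message) makes the offending input obvious.
            raise ValueError(
                "operator {0} must input same dtype. {1} vs {2}".format(
                    op_type, dtype, each.dtype))
    return dtype

# check_same_dtype('cross_entropy', [Var('float32'), Var('int64')]) raises:
# ValueError: operator cross_entropy must input same dtype. float32 vs int64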

python/paddle/v2/fluid/tests/book/test_machine_translation.py

Lines changed: 68 additions & 52 deletions
@@ -1,59 +1,62 @@
 import numpy as np
 import paddle.v2 as paddle
-import paddle.v2.dataset.conll05 as conll05
+import paddle.v2.fluid as fluid
 import paddle.v2.fluid.core as core
 import paddle.v2.fluid.framework as framework
 import paddle.v2.fluid.layers as layers
-from paddle.v2.fluid.executor import Executor, g_scope
-from paddle.v2.fluid.optimizer import SGDOptimizer
-import paddle.v2.fluid as fluid
-import paddle.v2.fluid.layers as pd
+from paddle.v2.fluid.executor import Executor
 
 dict_size = 30000
 source_dict_dim = target_dict_dim = dict_size
 src_dict, trg_dict = paddle.dataset.wmt14.get_dict(dict_size)
-hidden_dim = 512
-word_dim = 512
+hidden_dim = 32
+word_dim = 16
 IS_SPARSE = True
-batch_size = 50
+batch_size = 10
 max_length = 50
 topk_size = 50
 trg_dic_size = 10000
 
-src_word_id = layers.data(name="src_word_id", shape=[1], dtype='int64')
-src_embedding = layers.embedding(
-    input=src_word_id,
-    size=[dict_size, word_dim],
-    dtype='float32',
-    is_sparse=IS_SPARSE,
-    param_attr=fluid.ParamAttr(name='vemb'))
-
-
-def encoder():
-
-    lstm_hidden0, lstm_0 = layers.dynamic_lstm(
-        input=src_embedding,
-        size=hidden_dim,
-        candidate_activation='sigmoid',
-        cell_activation='sigmoid')
-
-    lstm_hidden1, lstm_1 = layers.dynamic_lstm(
-        input=src_embedding,
-        size=hidden_dim,
-        candidate_activation='sigmoid',
-        cell_activation='sigmoid',
-        is_reverse=True)
-
-    bidirect_lstm_out = layers.concat([lstm_hidden0, lstm_hidden1], axis=0)
-
-    return bidirect_lstm_out
-
-
-def decoder_trainer(context):
-    '''
-    decoder with trainer
-    '''
-    pass
+decoder_size = hidden_dim
+
+
+def encoder_decoder():
+    # encoder
+    src_word_id = layers.data(
+        name="src_word_id", shape=[1], dtype='int64', lod_level=1)
+    src_embedding = layers.embedding(
+        input=src_word_id,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    fc1 = fluid.layers.fc(input=src_embedding, size=hidden_dim * 4, act='tanh')
+    lstm_hidden0, lstm_0 = layers.dynamic_lstm(input=fc1, size=hidden_dim * 4)
+    encoder_out = layers.sequence_pool(input=lstm_hidden0, pool_type="last")
+
+    # decoder
+    trg_language_word = layers.data(
+        name="target_language_word", shape=[1], dtype='int64', lod_level=1)
+    trg_embedding = layers.embedding(
+        input=trg_language_word,
+        size=[dict_size, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr=fluid.ParamAttr(name='vemb'))
+
+    rnn = fluid.layers.DynamicRNN()
+    with rnn.block():
+        current_word = rnn.step_input(trg_embedding)
+        mem = rnn.memory(init=encoder_out)
+        fc1 = fluid.layers.fc(input=[current_word, mem],
+                              size=decoder_size,
+                              act='tanh')
+        out = fluid.layers.fc(input=fc1, size=target_dict_dim, act='softmax')
+        rnn.update_memory(mem, fc1)
+        rnn.output(out)
+
+    return rnn()
 
 
 def to_lodtensor(data, place):
@@ -72,13 +75,18 @@ def to_lodtensor(data, place):
 
 
 def main():
-    encoder_out = encoder()
-    # TODO(jacquesqiao) call here
-    decoder_trainer(encoder_out)
+    rnn_out = encoder_decoder()
+    label = layers.data(
+        name="target_language_next_word", shape=[1], dtype='int64', lod_level=1)
+    cost = layers.cross_entropy(input=rnn_out, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    optimizer = fluid.optimizer.Adagrad(learning_rate=1e-4)
+    optimizer.minimize(avg_cost)
 
     train_data = paddle.batch(
         paddle.reader.shuffle(
-            paddle.dataset.wmt14.train(8000), buf_size=1000),
+            paddle.dataset.wmt14.train(dict_size), buf_size=1000),
         batch_size=batch_size)
 
     place = core.CPUPlace()
@@ -88,15 +96,23 @@ def main():
 
     batch_id = 0
     for pass_id in xrange(2):
-        print 'pass_id', pass_id
         for data in train_data():
-            print 'batch', batch_id
-            batch_id += 1
-            if batch_id > 10: break
             word_data = to_lodtensor(map(lambda x: x[0], data), place)
+            trg_word = to_lodtensor(map(lambda x: x[1], data), place)
+            trg_word_next = to_lodtensor(map(lambda x: x[2], data), place)
             outs = exe.run(framework.default_main_program(),
-                           feed={'src_word_id': word_data, },
-                           fetch_list=[encoder_out])
+                           feed={
+                               'src_word_id': word_data,
+                               'target_language_word': trg_word,
+                               'target_language_next_word': trg_word_next
+                           },
+                           fetch_list=[avg_cost])
+            avg_cost_val = np.array(outs[0])
+            print('pass_id=' + str(pass_id) + ' batch=' + str(batch_id) +
+                  " avg_cost=" + str(avg_cost_val))
+            if batch_id > 3:
+                exit(0)
+            batch_id += 1
 
 
 if __name__ == '__main__':
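The training loop packs each column of the wmt14 batch through to_lodtensor, whose body lies outside the hunks above. A minimal helper of that shape, shown as an assumption about what such a packer does rather than the file's actual code, would flatten the sequences and record cumulative offsets:

import numpy as np
import paddle.v2.fluid.core as core

def pack_to_lodtensor(seqs, place):
    # seqs: list of word-id lists, one per sentence in the batch (e.g. x[0] for
    # 'src_word_id', x[1] for 'target_language_word', x[2] for 'target_language_next_word').
    lod = [0]
    for seq in seqs:
        lod.append(lod[-1] + len(seq))               # cumulative offsets
    flat = np.array([w for seq in seqs for w in seq], dtype="int64")
    flat = flat.reshape([len(flat), 1])              # matches shape=[1] in layers.data
    tensor = core.LoDTensor()
    tensor.set(flat, place)                          # copy data onto the target place
    tensor.set_lod([lod])                            # single-level LoD
    return tensor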

0 commit comments
