add crf_decoding layer (#6274)

jacquesqiao · web-flow · commit 45c8a88a3e0f · 2017-12-05T16:08:32.000+08:00
* add crf_decoding layer

* fix some typo

* fix test_crf_decoding_op
diff --git a/paddle/operators/crf_decoding_op.cc b/paddle/operators/crf_decoding_op.cc
@@ -36,17 +36,18 @@ class CRFDecodingOpMaker : public framework::OpProtoAndCheckerMaker {
         "w. See more details in comments of the linear_chain_crf operator.");
     AddInput(
         "Label",
-        "(LoDTensor,  LoDTensor<int>). The ground truth with shape "
+        "(LoDTensor,  LoDTensor<int64_t>). The ground truth with shape "
         "[N x 1]. This input is optional. See more details in the operator's "
         "comments.")
         .AsDispensable();
-    AddOutput("ViterbiPath",
-              "(LoDTensor, LoDTensor<int>). The decoding results. What to "
-              "return changes depending on whether the Input(Label) (the groud "
-              "truth) is given. See more details in the operator's comment.");
+    AddOutput(
+        "ViterbiPath",
+        "(LoDTensor, LoDTensor<int64_t>). The decoding results. What to "
+        "return changes depending on whether the Input(Label) (the ground "
+        "truth) is given. See more details in the operator's comment.");
     AddComment(R"DOC(
 The crf_decoding operator reads the emission feature weights and the transition
-freature weights learned by the linear_chain_crf operator. It implements the
+feature weights learned by the linear_chain_crf operator. It implements the
 Viterbi algorithm which is a dynamic programming algorithm for finding the most
 likely sequence of hidden states, called the Viterbi path, that results in a
 sequence of observed tags.
@@ -60,14 +61,14 @@ operator.
 
 When Input(Label) is given, the crf_decoding operator returns a row vector
 with shape [N x 1] whose values are fixed to be 0, indicating an incorrect
-prediction, or 1 indicating a tag is correctly predicted. Such an ouput is the
+prediction, or 1 indicating a tag is correctly predicted. Such an output is the
 input to chunk_eval operator.
 
 2. Input(Label) is not given:
 
 This is the standard decoding process.
 
-The crf_decoding operator returns a row vecotr with shape [N x 1] whose values
+The crf_decoding operator returns a row vector with shape [N x 1] whose values
 range from 0 to maximum tag number - 1. Each element indicates an index of a
 predicted tag.
 )DOC");
diff --git a/paddle/operators/crf_decoding_op.h b/paddle/operators/crf_decoding_op.h
@@ -43,9 +43,9 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
     const size_t level = 0;
     const size_t seq_num = lod[level].size() - 1;
 
-    int* path = decoded_path->mutable_data<int>(platform::CPUPlace());
-    math::SetConstant<platform::CPUPlace, int>()(ctx.device_context(),
-                                                 decoded_path, 0);
+    int64_t* path = decoded_path->mutable_data<int64_t>(platform::CPUPlace());
+    math::SetConstant<platform::CPUPlace, int64_t>()(ctx.device_context(),
+                                                     decoded_path, 0);
     for (size_t i = 0; i < seq_num; ++i) {
       int start_pos = static_cast<int>(lod[level][i]);
       int end_pos = static_cast<int>(lod[level][i + 1]);
@@ -57,7 +57,7 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
     if (label) {
       PADDLE_ENFORCE_EQ(label->NumLevels(), 1UL,
                         "The Input(Label) should be a sequence.");
-      const int* label_value = label->data<int>();
+      const int64_t* label_value = label->data<int64_t>();
       size_t batch_size = emission_weights->dims()[0];
       for (size_t i = 0; i < batch_size; ++i) {
         path[i] = label_value[i] == path[i] ? 1 : 0;
@@ -76,7 +76,7 @@ class CRFDecodingOpKernel : public framework::OpKernel<T> {
 
     const T* x = emission_weights.data<T>();
     const T* w = transition_weights.data<T>();
-    int* path = decoded_path->data<int>();
+    int64_t* path = decoded_path->data<int64_t>();
 
     // alpha is a memo table. An element alpha(k, v) records the score of the
     // best sequence of tags from position 1 to position k with v being the end
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
@@ -237,7 +237,7 @@ def __init__(self,
 
         def find_name(var_list, name):
             for var_name in var_list:
-                if var_name == name:
+                if var_list[var_name] is not None and var_name == name:
                     return True
             return False
 
diff --git a/python/paddle/v2/fluid/layer_helper.py b/python/paddle/v2/fluid/layer_helper.py
@@ -1,7 +1,7 @@
 import copy
 import itertools
 
-from framework import Variable, default_main_program, default_startup_program, \
+from framework import Variable, Parameter, default_main_program, default_startup_program, \
     unique_name, dtype_is_floating
 from paddle.v2.fluid.initializer import Constant, Xavier
 from param_attr import ParamAttr
@@ -122,6 +122,12 @@ def create_parameter(self,
         return self.main_program.global_block().create_parameter(
             dtype=dtype, shape=shape, **attr.to_kwargs())
 
+    def get_parameter(self, name):
+        param = self.main_program.global_block().var(name)
+        if not isinstance(param, Parameter):
+            raise ValueError("no Parameter name %s found" % name)
+        return param
+
     def create_tmp_variable(self, dtype):
         return self.main_program.current_block().create_var(
             name=unique_name(".".join([self.name, 'tmp'])),
diff --git a/python/paddle/v2/fluid/layers.py b/python/paddle/v2/fluid/layers.py
@@ -477,6 +477,24 @@ def linear_chain_crf(input,
     return log_likelihood
 
 
+def crf_decoding(input,
+                 param_attr,
+                 label=None,
+                 main_program=None,
+                 startup_program=None):
+    helper = LayerHelper('crf_decoding', **locals())
+    transition = helper.get_parameter(param_attr.name)
+    viterbi_path = helper.create_tmp_variable(dtype=helper.input_dtype())
+    helper.append_op(
+        type='crf_decoding',
+        inputs={"Emission": [input],
+                "Transition": transition,
+                "Label": label},
+        outputs={"ViterbiPath": [viterbi_path]})
+
+    return viterbi_path
+
+
 def assign(input, output, main_program=None, startup_program=None):
     helper = LayerHelper('assign', **locals())
     helper.append_op(
diff --git a/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book/test_label_semantic_roles.py
@@ -137,12 +137,19 @@ def main():
         param_attr=fluid.ParamAttr(
             name='crfw', learning_rate=mix_hidden_lr))
     avg_cost = fluid.layers.mean(x=crf_cost)
+
     # TODO(qiao)
-    #   1. add crf_decode_layer and evaluator
-    #   2. use other optimizer and check why out will be NAN
+    # check other optimizers and check why out will be NAN
     sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
     sgd_optimizer.minimize(avg_cost)
 
+    # TODO(qiao)
+    # add dependency track and move this config before optimizer
+    crf_decode = fluid.layers.crf_decoding(
+        input=feature_out,
+        label=target,
+        param_attr=fluid.ParamAttr(name='crfw'))
+
     train_data = paddle.batch(
         paddle.reader.shuffle(
             paddle.dataset.conll05.test(), buf_size=8192),
@@ -168,7 +175,6 @@ def main():
                            feed=feeder.feed(data),
                            fetch_list=[avg_cost])
             avg_cost_val = np.array(outs[0])
-
             if batch_id % 10 == 0:
                 print("avg_cost=" + str(avg_cost_val))
 
diff --git a/python/paddle/v2/fluid/tests/test_crf_decoding_op.py b/python/paddle/v2/fluid/tests/test_crf_decoding_op.py
@@ -20,14 +20,14 @@ def __init__(self, emission_weights, transition_weights,
         self.w = transition_weights[2:, :]
 
         self.track = np.zeros(
-            (seq_start_positions[-1], self.tag_num), dtype="int32")
+            (seq_start_positions[-1], self.tag_num), dtype="int64")
         self.decoded_path = np.zeros(
-            (seq_start_positions[-1], 1), dtype="int32")
+            (seq_start_positions[-1], 1), dtype="int64")
 
     def _decode_one_sequence(self, decoded_path, x):
         seq_len, tag_num = x.shape
         alpha = np.zeros((seq_len, tag_num), dtype="float64")
-        track = np.zeros((seq_len, tag_num), dtype="int32")
+        track = np.zeros((seq_len, tag_num), dtype="int64")
 
         for i in range(tag_num):
             alpha[0, i] = self.a[i] + x[0, i]
@@ -125,10 +125,10 @@ def setUp(self):
             axis=0)
 
         labels = np.random.randint(
-            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int32")
+            low=0, high=TAG_NUM, size=(lod[-1][-1], 1), dtype="int64")
         predicted_labels = np.ones(
-            (lod[-1][-1], 1), dtype="int32") * (TAG_NUM - 1)
-        expected_output = (labels == predicted_labels).astype("int32")
+            (lod[-1][-1], 1), dtype="int64") * (TAG_NUM - 1)
+        expected_output = (labels == predicted_labels).astype("int64")
 
         self.inputs = {
             "Emission": (emission, lod),
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
@@ -4,6 +4,7 @@
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.nets as nets
 from paddle.v2.fluid.framework import Program, program_guard
+from paddle.v2.fluid.param_attr import ParamAttr
 
 
 class TestBook(unittest.TestCase):
@@ -132,8 +133,12 @@ def test_linear_chain_crf(self):
             images = layers.data(name='pixel', shape=[784], dtype='float32')
             label = layers.data(name='label', shape=[1], dtype='int32')
             hidden = layers.fc(input=images, size=128)
-            crf = layers.linear_chain_crf(input=hidden, label=label)
+            crf = layers.linear_chain_crf(
+                input=hidden, label=label, param_attr=ParamAttr(name="crfw"))
+            crf_decode = layers.crf_decoding(
+                input=hidden, param_attr=ParamAttr(name="crfw"))
             self.assertNotEqual(crf, None)
+            self.assertNotEqual(crf_decode, None)
 
         print(str(program))