Skip to content

Commit 3e7fff4

Browse files
committed
Fix calculations in gru_unit_op
1 parent 01d6ccb commit 3e7fff4

File tree

3 files changed

+16
-23
lines changed

3 files changed

+16
-23
lines changed

paddle/operators/gru_unit_op.cc

Lines changed: 9 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -114,18 +114,19 @@ class GRUUnitOpMaker : public framework::OpProtoAndCheckerMaker {
114114
.SetDefault(sigmoid)
115115
.InEnum({identity, sigmoid, tanh, relu});
116116
AddComment(R"DOC(
117-
GRUUnit Operator.
118-
119-
This operator implements partial calculations of the GRU unit as follows:
117+
GRUUnit Operator implements partial calculations of the GRU unit as follows:
120118
121119
$$
122-
update \ gate: u_t = actGate(xu_t + W_u * hidden_{prev} + bias_u) \\
123-
reset \ gate: r_t = actGate(xr_t + W_r * hidden_{prev} + bias_r) \\
124-
output \ candidate: {h}_t = actNode({xc}_t + W_c * dot(r_t, hidden_{prev}) + bias_c) \\
125-
output: h_t = dot((1-u_t), {h}_t) + dot(u_t, hidden_{prev})
120+
update \ gate: u_t = actGate(xu_t + W_u * h_{t-1} + b_u) \\
121+
reset \ gate: r_t = actGate(xr_t + W_r * h_{t-1} + b_r) \\
122+
output \ candidate: {h}_t = actNode(xc_t + W_c * dot(r_t, h_{t-1}) + b_c) \\
123+
output: h_t = dot((1 - u_t), h_{t-1}) + dot(u_t, {h}_t)
126124
$$
127125
128-
The rest of GRU unit can be completed by using FCOp's output as the input of GRUUnitOp.
126+
which is same as one time step of GRU Operator.
127+
128+
@note To implement the complete GRU unit, a fully-connected operator must be
129+
used beforehand to feed xu, xr and xc as the Input of the GRUUnit operator.
129130
130131
)DOC");
131132
}
@@ -150,12 +151,6 @@ class GRUUnitGradOp : public framework::OperatorWithKernel {
150151
"ResetHiddenPrev");
151152
PADDLE_ENFORCE(ctx->HasInput("Hidden"),
152153
"Input(%s) of GRUUnitGradOp should not be null.", "Hidden");
153-
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Gate")),
154-
"Input(%s@GRAD) of GRUUnitGradOp should not be null.",
155-
"Gate");
156-
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("ResetHiddenPrev")),
157-
"Input(%s@GRAD) of GRUUnitGradOp should not be null.",
158-
"ResetHiddenPrev");
159154
PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Hidden")),
160155
"Input(%s@GRAD) of GRUUnitGradOp should not be null.",
161156
"Hidden");

paddle/operators/gru_unit_op.h

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -110,7 +110,7 @@ class GRUUnitKernel : public framework::OpKernel<T> {
110110
auto c = g.slice(c_offsets, extents); // output candidate
111111

112112
// calculate final output
113-
h.device(place) = u * (h_p - c) + c;
113+
h.device(place) = u * (c - h_p) + h_p;
114114
}
115115
};
116116

@@ -185,10 +185,10 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
185185

186186
// backward for unactivated update gate
187187
ActGradCompute(context.Attr<int>("gate_activation"), place, u, u,
188-
d_g.slice(u_offsets, extents), d_h * (h_p - c));
188+
d_g.slice(u_offsets, extents), d_h * (c - h_p));
189189
// backward for unactivated output candidate
190190
ActGradCompute(context.Attr<int>("activation"), place, c, c,
191-
d_g.slice(c_offsets, extents), d_h * (u.constant(T(1)) - u));
191+
d_g.slice(c_offsets, extents), d_h * u);
192192
// backward for reset_hidden_prev
193193
math::gemm<Place, T>(context.device_context(), false, true, batch_size,
194194
frame_size, frame_size, 1,
@@ -210,7 +210,7 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
210210
frame_size, gate_grad_data, frame_size * 3, 0,
211211
weight_grad_data, frame_size * 2);
212212
// backward for hidden_prev
213-
d_h_p.device(place) = d_r_h_p * r + d_h * u;
213+
d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u);
214214
math::gemm<Place, T>(context.device_context(), false, true, batch_size,
215215
frame_size, frame_size * 2, 1, gate_grad_data,
216216
frame_size * 3, weight_data, frame_size * 2, 1,

python/paddle/v2/fluid/tests/test_gru_unit_op.py

Lines changed: 3 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -77,7 +77,7 @@ def set_outputs(self):
7777
c = self.activate[self.attrs['activation']](np.dot(r_h_p, w_c) +
7878
g[:, frame_size * 2:])
7979
g = np.hstack((u_r, c))
80-
h = u * h_p + (1 - u) * c
80+
h = u * c + (1 - u) * h_p
8181
self.outputs = {
8282
'Gate': g.astype('float64'),
8383
'ResetHiddenPrev': r_h_p.astype('float64'),
@@ -93,8 +93,7 @@ def test_check_output(self):
9393

9494
def test_check_grad(self):
9595
self.check_grad(
96-
['Input', 'HiddenPrev', 'Weight'],
97-
['Hidden', 'ResetHiddenPrev', 'Gate'],
96+
['Input', 'HiddenPrev', 'Weight'], ['Hidden'],
9897
max_relative_error=0.007)
9998

10099

@@ -104,7 +103,7 @@ def set_inputs(self):
104103
frame_size = self.frame_size
105104
super(TestGRUUnitOpWithBias, self).set_inputs()
106105
self.inputs['Bias'] = np.random.uniform(
107-
-0.1, 0.1, (1, frame_size * 3)).astype('float32')
106+
-0.1, 0.1, (1, frame_size * 3)).astype('float64')
108107
self.attrs = {
109108
'activation': GRUActivationType.identity,
110109
'gate_activation': GRUActivationType.sigmoid
@@ -117,5 +116,4 @@ def test_check_grad(self):
117116

118117

119118
if __name__ == '__main__':
120-
exit(0) # FIXME(yuyang18): This unittest is not pass. Fix it later
121119
unittest.main()

0 commit comments

Comments (0)