
Commit 39d88eb

Merge branch 'fix-optimizer-accumulator' of ssh://github.com/jacquesqiao/Paddle into distribute-transpiler-handle-adam-accumulator
2 parents: 5c12c5e + 3748aa4

17 files changed: +1537 −753 lines

paddle/fluid/operators/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -259,6 +259,7 @@ op_library(max_sequence_len_op DEPS lod_rank_table)
 op_library(sequence_conv_op DEPS context_project)
 op_library(sequence_pool_op DEPS sequence_pooling)
 op_library(lstm_op DEPS sequence2batch lstm_compute)
+op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
 op_library(lstmp_op DEPS sequence2batch lstm_compute)
 op_library(gru_op DEPS sequence2batch gru_compute)
 op_library(recurrent_op DEPS executor)
paddle/fluid/operators/hierarchical_sigmoid_op.cc (new file)

Lines changed: 167 additions & 0 deletions

@@ -0,0 +1,167 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/hierarchical_sigmoid_op.h"
#include <vector>

namespace paddle {
namespace operators {

/**
 * Organize the classes into a binary tree. At each node, a sigmoid function
 * is used to calculate the probability of belonging to the right branch.
 * This idea is from "F. Morin, Y. Bengio (AISTATS 05):
 * Hierarchical Probabilistic Neural Network Language Model."
 *
 * Here we use a simple way of making the binary tree.
 * Assuming the number of classes is C = 6, the classes are organized as a
 * binary tree in the following way:
 *
 * @code{.py}
 * *-*-*- 2
 * | | |- 3
 * | |
 * | |-*- 4
 * |   |- 5
 * |
 * |-*- 0
 *   |- 1
 * @endcode
 *
 * where * indicates an internal node, and each leaf node represents a class.
 * - Nodes 0 ... C-2 are internal nodes.
 * - Nodes C-1 ... 2C-2 are leaf nodes.
 * - Class c is represented by leaf node \f$c+C-1\f$.
 *
 * We assign an id to each node:
 * - the id of the root is 0.
 * - the left child of node i is 2*i+1.
 * - the right child of node i is 2*i+2.
 *
 * It's easy to see that:
 * - the parent of node i is \f$\left\lfloor(i-1)/2\right\rfloor\f$.
 * - the j-th level ancestor of node i is
 *   \f$\left\lfloor(i+1)/2^{j+1}\right\rfloor - 1\f$.
 * - node i is a left child of its parent if \f$(i-1)\%2==0\f$.
 */

class HierarchicalSigmoidOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("PreOut"),
                   "Output(PreOut) should not be null.");
    const int64_t batch_size = ctx->GetInputDim("X")[0];
    std::vector<int64_t> output_shape({batch_size, 1});
    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
        ctx.GetPlace());
  }
};

template <typename AttrType>
class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
             "(Tensor, required) The input tensor with shape [N, D], "
             "where N is the size of the mini-batch and D is the feature "
             "size.");
    AddInput("W",
             "(Tensor, required) The parameters of the hierarchical "
             "sigmoid operator; each is a 2-D tensor with shape "
             "[num_classes - 1, D].");
    AddInput("Label",
             "(Tensor, required) The labels of the training data. It's a "
             "tensor with shape [N, 1].");
    AddInput("Bias",
             "(Tensor, optional) The bias is a tensor with shape "
             "[1, num_classes - 1].");
    AddOutput("Out",
              "(Tensor, required) The output of the hierarchical sigmoid "
              "operator. The shape is [N, 1].");
    AddOutput("PreOut",
              "(Tensor, required) An intermediate 2-D tensor with shape "
              "[batch_size, code_length], where code_length represents the "
              "maximum path length from root to leaf nodes.")
        .AsIntermediate();
    AddAttr<AttrType>("num_classes", "(int, required) The number of classes.")
        .SetDefault(2);
    AddComment(R"DOC(
The hierarchical sigmoid operator organizes the classes into a binary tree.
At each node, a sigmoid function is used to calculate the probability of
belonging to the right branch. This idea is from
"F. Morin, Y. Bengio (AISTATS 05):
Hierarchical Probabilistic Neural Network Language Model."
)DOC");
  }
};

class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("PreOut"),
                   "Input(PreOut) should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("W")),
                   "Output(W@Grad) should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")));
    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
      ctx->SetOutputDim(framework::GradVarName("Bias"),
                        ctx->GetInputDim("Bias"));
    }
    ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
        ctx.GetPlace());
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp,
                  ops::HierarchicalSigmoidOpMaker<int>,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp);
REGISTER_OP_CPU_KERNEL(
    hierarchical_sigmoid,
    ops::HierarchicalSigmoidOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::HierarchicalSigmoidOpKernel<paddle::platform::CPUDeviceContext,
                                     double>);
REGISTER_OP_CPU_KERNEL(
    hierarchical_sigmoid_grad,
    ops::HierarchicalSigmoidGradOpKernel<paddle::platform::CPUDeviceContext,
                                         float>,
    ops::HierarchicalSigmoidGradOpKernel<paddle::platform::CPUDeviceContext,
                                         double>);
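To make the node-id arithmetic in the comment above concrete, here is a small standalone C++ sketch (not part of the commit; the helper names are mine) that evaluates the formulas for the C = 6 example: class c is stored at leaf c + C - 1, and the parent/ancestor formulas walk back toward the root.

// Minimal sketch of the complete-binary-tree layout described above:
// the root has id 0, the children of node i are 2*i+1 and 2*i+2, and
// class c is represented by leaf node c + C - 1.
#include <iostream>

int Parent(int i) { return (i - 1) / 2; }
bool IsLeftChild(int i) { return (i - 1) % 2 == 0; }
// j-th level ancestor: floor((i + 1) / 2^(j + 1)) - 1; j = 0 gives the parent.
int Ancestor(int i, int j) { return ((i + 1) >> (j + 1)) - 1; }

int main() {
  const int num_classes = 6;  // the C = 6 example from the comment
  for (int c = 0; c < num_classes; ++c) {
    int node = c + num_classes - 1;  // leaf node representing class c
    std::cout << "class " << c << " -> leaf " << node
              << ", parent " << Parent(node)
              << ", grandparent " << Ancestor(node, 1)
              << (IsLeftChild(node) ? " (left child)\n" : " (right child)\n");
  }
  return 0;
}

For class 0 this prints leaf 5 with parent 2 and grandparent 0 (a depth-2 leaf, like classes 0 and 1 in the diagram), while classes 2 through 5 land on depth-3 leaves.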
paddle/fluid/operators/hierarchical_sigmoid_op.h (new file)

Lines changed: 135 additions & 0 deletions

@@ -0,0 +1,135 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <iostream>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/clip_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/matrix_bit_code.h"
#include "paddle/fluid/platform/transform.h"

namespace paddle {
namespace operators {

template <typename T, int MajorType = Eigen::RowMajor,
          typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
using platform::Transform;

template <typename DeviceContext, typename T>
class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<framework::Tensor>("X");
    auto* w = ctx.Input<framework::Tensor>("W");
    auto* label = ctx.Input<framework::Tensor>("Label");
    auto* bias = ctx.Input<framework::Tensor>("Bias");
    auto* out = ctx.Output<framework::Tensor>("Out");
    auto* pre_out = ctx.Output<framework::Tensor>("PreOut");
    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
    int64_t code_length = math::FindLastSet(num_classes - 1);
    int64_t batch_size = in->dims()[0];
    framework::Tensor sum;
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    auto* pre_out_data = pre_out->mutable_data<T>(
        framework::make_ddim({batch_size, code_length}), ctx.GetPlace());
    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
    // Not all class (leaf) nodes' path lengths equal code_length, so
    // initializing PreOut with zeros keeps the out-of-path entries from
    // contributing to the loss.
    math::SetConstant<DeviceContext, T> zero;
    zero(dev_ctx, pre_out, static_cast<T>(0.0));
    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    math::RowwiseSum<DeviceContext, T> row_sum;
    math::MatrixBitCodeFunctor<T> bit_code(num_classes, label->data<int64_t>());

    std::vector<int64_t> sum_dims({batch_size, 1UL});
    sum.mutable_data<T>(framework::make_ddim(sum_dims), ctx.GetPlace());
    auto sum_mat = EigenMatrix<T>::From(sum);
    out->mutable_data<T>(ctx.GetPlace());
    auto out_mat = framework::EigenVector<T>::Flatten(*out);
    if (bias) {
      bit_code.Add(pre_out, *bias);
    }
    bit_code.Mul(pre_out, *w, *in);
    // clip to [-40, 40]
    Transform<DeviceContext> trans;
    trans(ctx.template device_context<DeviceContext>(), pre_out_data,
          pre_out_data + pre_out->numel(), pre_out_data,
          ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
    bit_code.Sum(*pre_out, out, static_cast<T>(-1));
    // use softrelu to calculate cross entropy
    pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
    row_sum(dev_ctx, *pre_out, &sum);
    // TODO(guosheng): Subtract the out-of-path loss, since not all
    // class (leaf) nodes' path lengths equal code_length. But it won't break
    // the gradient check, since both include the out-of-path loss and it
    // cancels out.
    out_mat.device(place) = sum_mat + out_mat;
  }
};

template <typename DeviceContext, typename T>
class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in = ctx.Input<framework::Tensor>("X");
    auto* w = ctx.Input<framework::Tensor>("W");
    auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
    auto* w_grad = ctx.Output<framework::Tensor>(framework::GradVarName("W"));
    auto* bias_grad =
        ctx.Output<framework::Tensor>(framework::GradVarName("Bias"));
    auto* label = ctx.Input<framework::Tensor>("Label");
    auto* pre_out = ctx.Input<framework::Tensor>("PreOut");
    auto* out_grad =
        ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
    framework::Tensor pre_out_grad;

    pre_out_grad.mutable_data<T>(pre_out->dims(), ctx.GetPlace());
    in_grad->mutable_data<T>(ctx.GetPlace());
    w_grad->mutable_data<T>(ctx.GetPlace());
    auto& dev_ctx = ctx.template device_context<DeviceContext>();
    math::SetConstant<DeviceContext, T> zero;
    zero(dev_ctx, in_grad, static_cast<T>(0.0));
    zero(dev_ctx, w_grad, static_cast<T>(0.0));

    size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
    math::MatrixBitCodeFunctor<T> bit_code(num_classes, label->data<int64_t>());

    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
    auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
    auto pre_out_grad_mat = EigenMatrix<T>::From(pre_out_grad);
    auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
    Eigen::array<int, 2> bcast({{1, static_cast<int>(pre_out_grad.dims()[1])}});

    // softrelu derivative
    pre_out_grad_mat.device(place) =
        static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp();
    bit_code.Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
    pre_out_grad_mat.device(place) =
        pre_out_grad_mat * out_grad_mat.broadcast(bcast);
    // TODO(guosheng): multiply pre_out_grad by the subgradient of clipping to
    // be consistent with the clipping in the forward pass.
    if (bias_grad) {
      bias_grad->mutable_data<T>(ctx.GetPlace());
      zero(dev_ctx, bias_grad, static_cast<T>(0.0));
      bit_code.AddGrad(pre_out_grad, bias_grad);
    }
    bit_code.MulGradWeight(pre_out_grad, w_grad, *in);
    bit_code.MulGradError(pre_out_grad, *w, in_grad);
  }
};

}  // namespace operators
}  // namespace paddle
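As a reading aid, here is a sketch of the quantity the two kernels appear to compute, written in my own notation (the commit does not state it) and under the assumption that MatrixBitCodeFunctor::Sum accumulates PreOut over the path positions whose code bit is 1. For a sample x with label y, let n_1, ..., n_L be the internal nodes on the path from the root to the leaf of y (L <= code_length = FindLastSet(num_classes - 1)), and let t_j in {0, 1} be the corresponding code bits. The forward kernel then produces

\[
z_j = \operatorname{clip}\bigl(w_{n_j}^{\top} x + b_{n_j},\, -40,\, 40\bigr),
\qquad
\mathrm{Out} = \sum_{j=1}^{L} \Bigl[ \log\bigl(1 + e^{z_j}\bigr) - t_j\, z_j \Bigr],
\]

i.e. the binary cross-entropy accumulated along the path, written with the softrelu \(\log(1 + e^{z})\) as the forward comment says: bit_code.Sum contributes the \(-t_j z_j\) terms and RowwiseSum the softrelu terms. The backward pass is consistent with this, since

\[
\frac{\partial\, \mathrm{Out}}{\partial z_j}
= \sigma(z_j) - t_j
= \bigl(1 - e^{-\operatorname{softrelu}(z_j)}\bigr) - t_j,
\]

which is why the gradient kernel computes 1 - 1 / exp(PreOut) (PreOut holds the softrelu values because the forward pass overwrites it) and then calls bit_code.Sub to subtract the code bits, before scaling by the incoming Out gradient.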

paddle/fluid/operators/math/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -51,6 +51,7 @@ math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function)
+math_library(matrix_bit_code)
 math_library(unpooling)
 math_library(vol2col)

paddle/fluid/operators/math/math_function_impl.h

Lines changed: 1 addition & 1 deletion
@@ -155,7 +155,7 @@ class RowwiseSum<platform::CPUDeviceContext, T> {
     PADDLE_ENFORCE_EQ(in_dims.size(), 2U);
     auto height = in_dims[0];
     auto size = in_dims[1];
-    PADDLE_ENFORCE_EQ(out->numel(), size);
+    PADDLE_ENFORCE_EQ(out->numel(), height);

     T* out_buf = out->mutable_data<T>(out->place());
     const T* in_buf = input.data<T>();
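The one-line change above fixes the shape check in RowwiseSum: summing a [height, size] matrix along each row yields one value per row, so the output must hold height elements, not size. In this commit the functor reduces PreOut ([batch_size, code_length]) to the per-sample loss of shape [batch_size, 1], which is presumably what exposed the wrong check. A tiny standalone illustration of the invariant (plain C++, not Paddle code):

// Summing an H x W matrix along each row yields H values, one per row,
// which is the invariant the corrected PADDLE_ENFORCE_EQ states.
#include <cassert>
#include <vector>

std::vector<double> RowwiseSum(const std::vector<double>& in, int height,
                               int width) {
  std::vector<double> out(height, 0.0);  // one accumulator per row
  for (int r = 0; r < height; ++r) {
    for (int c = 0; c < width; ++c) {
      out[r] += in[r * width + c];
    }
  }
  return out;
}

int main() {
  // A 2 x 3 matrix: rows {1, 2, 3} and {4, 5, 6}.
  std::vector<double> m = {1, 2, 3, 4, 5, 6};
  std::vector<double> s = RowwiseSum(m, /*height=*/2, /*width=*/3);
  assert(s.size() == 2);                // out->numel() == height, not width
  assert(s[0] == 6.0 && s[1] == 15.0);  // per-row sums
  return 0;
}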
