Commit c8801e1 (parent: f37bd03)

grad diff problem to be fixed, and API spec change to be done

File tree: 8 files changed, +324 −60 lines

paddle/fluid/framework/selected_rows.h (2 additions, 1 deletion)
@@ -133,7 +133,8 @@ class SelectedRows {
   // SelectedRows are simply concated when adding together. Until a
   // SelectedRows add a Tensor, will the duplicate rows be handled.
   Vector<int64_t> rows_;
-  std::unordered_map<int64_t, int64_t> id_to_index_;
+  std::unordered_map<int64_t, int64_t>
+      id_to_index_;  // should not be used when ids contain duplicate members
   std::unique_ptr<Tensor> value_{nullptr};
   int64_t height_;
   std::unique_ptr<RWLock> rwlock_{nullptr};
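
The new comment is worth spelling out with a minimal standalone sketch (not Paddle code) of why duplicate ids defeat an id-to-index map: later insertions silently overwrite earlier ones, so one of the duplicate rows becomes unreachable through the map.

#include <cstdint>
#include <iostream>
#include <unordered_map>
#include <vector>

int main() {
  // rows_ with a duplicate id: id 7 appears at indices 1 and 3.
  std::vector<int64_t> rows = {3, 7, 5, 7};
  std::unordered_map<int64_t, int64_t> id_to_index;
  for (int64_t i = 0; i < static_cast<int64_t>(rows.size()); ++i) {
    id_to_index[rows[i]] = i;  // the later index silently overwrites the earlier one
  }
  std::cout << id_to_index[7] << "\n";  // prints 3; the row at index 1 is lost
  return 0;
}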

paddle/fluid/operators/hierarchical_sigmoid_op.cc (10 additions, 1 deletion)
@@ -91,10 +91,19 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("W",
              "(Tensor, required), The parameters of hierarchical "
              "sigmoid operator, each of them is a 2-D tensor, the shape is"
-             "[num_classes - 1, D].");
+             "[K, D], where K is the number of non-leaf nodes in the path tree.");
     AddInput("Label",
              "(Tensor, required), The labels of training data. It's a"
              "tensor with shape [N, 1].");
+    AddInput("PTable",
+             "(Tensor, optional), The path table from the root to the current "
+             "word; it should have shape [N, L], where L is the path length.")
+        .AsDispensable();
+    AddInput("PCode",
+             "(Tensor, optional), The code at each node on the path from the "
+             "root to the current word; it should have shape [N, L], where L "
+             "is the path length.")
+        .AsDispensable();
     AddInput("Bias",
              "(Tensor, optional), The bias is a tensor with shape"
              "[1, num_classes - 1].");

paddle/fluid/operators/hierarchical_sigmoid_op.h (45 additions, 10 deletions)
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <iostream>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/operators/clip_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/matrix_bit_code.h"
@@ -34,12 +35,21 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<framework::Tensor>("X");
     auto* w = ctx.Input<framework::Tensor>("W");
+    auto* path = ctx.Input<framework::Tensor>("PTable");
+    auto* code = ctx.Input<framework::Tensor>("PCode");
     auto* label = ctx.Input<framework::Tensor>("Label");
     auto* bias = ctx.Input<framework::Tensor>("Bias");
     auto* out = ctx.Output<framework::Tensor>("Out");
     auto* pre_out = ctx.Output<framework::Tensor>("PreOut");
     size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
-    int64_t code_length = math::FindLastSet(num_classes - 1);
+    bool is_custom = false;
+    if (path) {
+      is_custom = true;
+    } else {
+      is_custom = false;
+    }
+    int64_t code_length =
+        path ? path->dims()[1] : math::FindLastSet(num_classes - 1);
     int64_t batch_size = in->dims()[0];
     framework::Tensor sum;
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
@@ -52,23 +62,31 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel<T> {
     zero(dev_ctx, pre_out, static_cast<T>(0.0));
     auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
     math::RowwiseSum<DeviceContext, T> row_sum;
-    math::MatrixBitCodeFunctor<T> bit_code(num_classes, label->data<int64_t>());
+
+    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
+    if (!is_custom) {
+      bit_code.reset(new math::MatrixBitCodeFunctor<T>(num_classes,
+                                                       label->data<int64_t>()));
+    } else {
+      bit_code.reset(new math::MatrixBitCodeFunctor<T>(path, code,
+                                                       label->data<int64_t>()));
+    }

     std::vector<int64_t> sum_dims({batch_size, 1UL});
     sum.mutable_data<T>(framework::make_ddim(sum_dims), ctx.GetPlace());
     auto sum_mat = EigenMatrix<T>::From(sum);
     out->mutable_data<T>(ctx.GetPlace());
     auto out_mat = framework::EigenVector<T>::Flatten(*out);
     if (bias) {
-      bit_code.Add(pre_out, *bias);
+      bit_code->Add(pre_out, *bias);
     }
-    bit_code.Mul(pre_out, *w, *in);
+    bit_code->Mul(pre_out, *w, *in);
     // clip to [-40, 40]
     Transform<DeviceContext> trans;
     trans(ctx.template device_context<DeviceContext>(), pre_out_data,
           pre_out_data + pre_out->numel(), pre_out_data,
           ClipFunctor<T>(static_cast<T>(-40.0), static_cast<T>(40.0)));
-    bit_code.Sum(*pre_out, out, static_cast<T>(-1));
+    bit_code->Sum(*pre_out, out, static_cast<T>(-1));
     // use softrelu to calculate cross entropy
     pre_out_mat.device(place) = (static_cast<T>(1.0) + pre_out_mat.exp()).log();
     row_sum(dev_ctx, *pre_out, &sum);
@@ -86,6 +104,8 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
   void Compute(const framework::ExecutionContext& ctx) const override {
     auto* in = ctx.Input<framework::Tensor>("X");
     auto* w = ctx.Input<framework::Tensor>("W");
+    auto* path = ctx.Input<framework::Tensor>("PTable");
+    auto* code = ctx.Input<framework::Tensor>("PCode");
     auto* in_grad = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
     auto* w_grad = ctx.Output<framework::Tensor>(framework::GradVarName("W"));
     auto* bias_grad =
@@ -105,7 +125,22 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
     zero(dev_ctx, w_grad, static_cast<T>(0.0));

     size_t num_classes = static_cast<size_t>(ctx.Attr<int>("num_classes"));
-    math::MatrixBitCodeFunctor<T> bit_code(num_classes, label->data<int64_t>());
+
+    bool is_custom = false;
+    if (path) {
+      is_custom = true;
+    } else {
+      is_custom = false;
+    }
+
+    std::unique_ptr<math::MatrixBitCodeFunctor<T>> bit_code;
+    if (!is_custom) {
+      bit_code.reset(new math::MatrixBitCodeFunctor<T>(num_classes,
+                                                       label->data<int64_t>()));
+    } else {
+      bit_code.reset(new math::MatrixBitCodeFunctor<T>(path, code,
+                                                       label->data<int64_t>()));
+    }

     auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
     auto pre_out_mat = EigenMatrix<T>::From(*pre_out);
@@ -116,18 +151,18 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
     // softrelu derivative
     pre_out_grad_mat.device(place) =
         static_cast<T>(1.0) - static_cast<T>(1.0) / pre_out_mat.exp();
-    bit_code.Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
+    bit_code->Sub(&pre_out_grad);  // the gradient of clip(w * x + b)
     pre_out_grad_mat.device(place) =
         pre_out_grad_mat * out_grad_mat.broadcast(bcast);
     // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
     // be consistent with the clipping in forward.
     if (bias_grad) {
       bias_grad->mutable_data<T>(ctx.GetPlace());
       zero(dev_ctx, bias_grad, static_cast<T>(0.0));
-      bit_code.AddGrad(pre_out_grad, bias_grad);
+      bit_code->AddGrad(pre_out_grad, bias_grad);
     }
-    bit_code.MulGradWeight(pre_out_grad, w_grad, *in);
-    bit_code.MulGradError(pre_out_grad, *w, in_grad);
+    bit_code->MulGradWeight(pre_out_grad, w_grad, *in);
+    bit_code->MulGradError(pre_out_grad, *w, in_grad);
   }
 };
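
Both kernels now pick a MatrixBitCodeFunctor construction at runtime based on whether the optional PTable input is fed. A self-contained mock of that dispatch follows; MockTensor, MockFunctor, and MakeBitCode are illustrative stand-ins, not Paddle APIs, and the mock also shows the five-line is_custom flag collapsed to a single expression.

#include <cstddef>
#include <iostream>
#include <memory>

struct MockTensor {};

struct MockFunctor {
  explicit MockFunctor(size_t num_classes) {
    std::cout << "default tree over " << num_classes << " classes\n";
  }
  MockFunctor(const MockTensor* /*path*/, const MockTensor* /*code*/) {
    std::cout << "custom path table\n";
  }
};

std::unique_ptr<MockFunctor> MakeBitCode(const MockTensor* path,
                                         const MockTensor* code,
                                         size_t num_classes) {
  // Mirrors the kernels' choice: `bool is_custom = (path != nullptr);`
  if (path) {
    return std::unique_ptr<MockFunctor>(new MockFunctor(path, code));
  }
  return std::unique_ptr<MockFunctor>(new MockFunctor(num_classes));
}

int main() {
  MockTensor table, codes;
  MakeBitCode(nullptr, nullptr, 10);  // prints: default tree over 10 classes
  MakeBitCode(&table, &codes, 10);    // prints: custom path table
  return 0;
}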

paddle/fluid/operators/math/matrix_bit_code.cc (21 additions, 28 deletions)
@@ -21,14 +21,13 @@ namespace math {
 template <typename T>
 void MatrixBitCodeFunctor<T>::Add(framework::Tensor* tmat,
                                   const framework::Tensor& vec) {
-  SimpleCodeTable code_table(num_classes_);
   size_t batch_size = tmat->dims()[0];
   size_t width = tmat->dims()[1];
   for (size_t i = 0; i < batch_size; ++i) {
-    auto code = code_table(static_cast<size_t>(ids_[i]));
-    int code_length = code.get_length();
+    auto code = code_table->get_code(i);
+    int code_length = code->get_length();
     for (int j = 0; j < code_length; ++j) {
-      size_t index = code.calc_index(j);
+      size_t index = code->calc_index(j);
       tmat->data<T>()[i * width + j] += vec.data<T>()[index];
     }
   }
@@ -37,14 +36,13 @@ void MatrixBitCodeFunctor<T>::Add(framework::Tensor* tmat,
 template <typename T>
 void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor& tmat,
                                       framework::Tensor* vec) {
-  SimpleCodeTable code_table(num_classes_);
   size_t batch_size = tmat.dims()[0];
   size_t width = tmat.dims()[1];
   for (size_t i = 0; i < batch_size; ++i) {
-    auto code = code_table(static_cast<size_t>(ids_[i]));
-    int code_length = code.get_length();
+    auto code = code_table->get_code(i);
+    int code_length = code->get_length();
     for (int j = 0; j < code_length; ++j) {
-      size_t index = code.calc_index(j);
+      size_t index = code->calc_index(j);
       vec->data<T>()[index] += tmat.data<T>()[i * width + j];
     }
   }
@@ -53,15 +51,14 @@ void MatrixBitCodeFunctor<T>::AddGrad(const framework::Tensor& tmat,
 template <typename T>
 void MatrixBitCodeFunctor<T>::Sum(const framework::Tensor& tmat,
                                   framework::Tensor* sum, T scale_sum) {
-  SimpleCodeTable code_table(num_classes_);
   size_t num_samples = tmat.dims()[0];
   size_t o_width = tmat.dims()[1];
   for (size_t i = 0; i < num_samples; ++i) {
     T sm = static_cast<T>(0.0);
-    auto code = code_table(static_cast<size_t>(ids_[i]));
-    int code_length = code.get_length();
+    auto code = code_table->get_code(i);
+    int code_length = code->get_length();
     for (int j = 0; j < code_length; ++j) {
-      if (code.calc_bit(j)) {
+      if (code->calc_bit(j)) {
         // calc_bit starts from right most bit, while data in tmat[i] is in the
         // reverse order.
         sm += tmat.data<T>()[i * o_width + j];
@@ -75,7 +72,6 @@ template <typename T>
 void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
                                   const framework::Tensor& weight,
                                   const framework::Tensor& input) {
-  SimpleCodeTable code_table(num_classes_);
   size_t num_samples = tmat->dims()[0];
   size_t tmat_width = tmat->dims()[1];
   size_t input_width = input.dims()[1];
@@ -84,10 +80,10 @@ void MatrixBitCodeFunctor<T>::Mul(framework::Tensor* tmat,
   auto weight_value = weight.data<T>();
   auto input_value = input.data<T>();
   for (size_t i = 0; i < num_samples; ++i) {
-    auto code = code_table(static_cast<size_t>(ids_[i]));
-    int code_length = code.get_length();
+    auto code = code_table->get_code(i);
+    int code_length = code->get_length();
     for (int j = 0; j < code_length; ++j) {
-      size_t index = code.calc_index(j);
+      size_t index = code->calc_index(j);
       T sum = static_cast<T>(0.0);
       for (size_t k = 0; k < input_width; ++k) {
         sum += weight_value[weight_width * index + k] *
@@ -102,7 +98,6 @@ template <typename T>
 void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
                                             framework::Tensor* weight,
                                             const framework::Tensor& input) {
-  SimpleCodeTable code_table(num_classes_);
   size_t num_samples = tmat.dims()[0];
   size_t input_width = input.dims()[1];
   size_t tmat_width = tmat.dims()[1];
@@ -111,10 +106,10 @@ void MatrixBitCodeFunctor<T>::MulGradWeight(const framework::Tensor& tmat,
   auto weight_value = weight->data<T>();
   auto input_value = input.data<T>();
   for (size_t i = 0; i < num_samples; ++i) {
-    auto code = code_table(static_cast<size_t>(ids_[i]));
-    int code_length = code.get_length();
+    auto code = code_table->get_code(i);
+    int code_length = code->get_length();
     for (int j = 0; j < code_length; ++j) {
-      size_t index = code.calc_index(j);
+      size_t index = code->calc_index(j);

       for (size_t k = 0; k < input_width; ++k) {
         weight_value[weight_width * index + k] +=
@@ -128,7 +123,6 @@ template <typename T>
 void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat,
                                            const framework::Tensor& weight,
                                            framework::Tensor* input) {
-  SimpleCodeTable code_table(num_classes_);
   size_t num_samples = tmat.dims()[0];
   size_t tmat_width = tmat.dims()[1];
   size_t input_width = input->dims()[1];
@@ -138,10 +132,10 @@ void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat,
   auto input_value = input->data<T>();

   for (size_t i = 0; i < num_samples; ++i) {
-    auto code = code_table(static_cast<size_t>(ids_[i]));
-    int code_length = code.get_length();
+    auto code = code_table->get_code(i);
+    int code_length = code->get_length();
     for (int j = 0; j < code_length; ++j) {
-      size_t index = code.calc_index(j);
+      size_t index = code->calc_index(j);

       for (size_t k = 0; k < input_width; ++k) {
         input_value[input_width * i + k] +=
@@ -154,14 +148,13 @@ void MatrixBitCodeFunctor<T>::MulGradError(const framework::Tensor& tmat,

 template <typename T>
 void MatrixBitCodeFunctor<T>::Sub(framework::Tensor* tmat) {
-  SimpleCodeTable code_table(num_classes_);
   size_t num_samples = tmat->dims()[0];
   size_t o_width = tmat->dims()[1];
   for (size_t i = 0; i < num_samples; ++i) {
-    auto code = code_table(static_cast<size_t>(ids_[i]));
-    int code_length = code.get_length();
+    auto code = code_table->get_code(i);
+    int code_length = code->get_length();
     for (int j = 0; j < code_length; ++j) {
-      if (code.calc_bit(j)) {
+      if (code->calc_bit(j)) {
         tmat->data<T>()[i * o_width + j] -= 1;
       }
     }
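
Every method above now asks a stored code_table for a per-sample code object instead of constructing a SimpleCodeTable from num_classes_ on each call; that indirection is what lets a custom PTable/PCode-backed table plug in behind the same call sites. A hedged sketch of the interface those calls imply, with an illustrative SimpleCode reproducing the old complete-binary-tree scheme; this mirrors the call sites but is not the verbatim Paddle header.

#include <cstddef>
#include <cstdint>
#include <iostream>
#include <memory>

// Interface implied by code_table->get_code(i) followed by get_length(),
// calc_index(j), and calc_bit(j) on the result.
class Code {
 public:
  virtual ~Code() = default;
  virtual int get_length() const = 0;            // nodes on this sample's path
  virtual size_t calc_index(int bit) const = 0;  // non-leaf node id at step bit
  virtual bool calc_bit(int bit) const = 0;      // branch taken at step bit
};

// Default scheme: label id + num_classes encodes a path in a complete binary
// tree, as the removed SimpleCodeTable(num_classes_) computed per call.
class SimpleCode : public Code {
 public:
  SimpleCode(size_t id, size_t num_classes) : c_(id + num_classes) {}
  int get_length() const override {
    int len = 0;
    for (size_t c = c_; c > 1; c >>= 1) ++len;  // floor(log2(c_)), FindLastSet-style
    return len;
  }
  size_t calc_index(int bit) const override { return (c_ >> (bit + 1)) - 1; }
  bool calc_bit(int bit) const override { return (c_ >> bit) & 1; }

 private:
  size_t c_;
};

// Table held by the functor; get_code(i) resolves sample i's label to a code.
// A custom table would instead return codes backed by PTable/PCode rows.
class SimpleCodeTable {
 public:
  SimpleCodeTable(size_t num_classes, const int64_t* ids)
      : num_classes_(num_classes), ids_(ids) {}
  std::unique_ptr<Code> get_code(size_t i) const {
    return std::unique_ptr<Code>(
        new SimpleCode(static_cast<size_t>(ids_[i]), num_classes_));
  }

 private:
  size_t num_classes_;
  const int64_t* ids_;
};

int main() {
  const int64_t ids[] = {3};
  SimpleCodeTable table(/*num_classes=*/8, ids);
  auto code = table.get_code(0);
  std::cout << code->get_length() << "\n";  // 3: label 3 sits at depth 3 of an 8-leaf tree
  return 0;
}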
