
Commit 02d6805

add sparsed bias grad, test=develop
1 parent 42470f1 commit 02d6805

6 files changed (+78, -29 lines)


paddle/fluid/operators/hierarchical_sigmoid_op.cc

Lines changed: 22 additions & 10 deletions
@@ -107,8 +107,9 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
              "it should have shape like [N, L], L is the length of the Path")
         .AsDispensable();
     AddInput("Bias",
-             "(LoDTensor, optional), The bias is a tensor with shape"
-             "[1, num_classes - 1].");
+             "(LoDTensor, optional), The bias is a tensor with shape "
+             "[non_leaf_num, 1] or "
+             "[num_classes - 1, 1].");
     AddOutput(
         "Out",
         "(LoDTensor, required) The output of hierarchical sigmoid operator."
@@ -148,11 +149,11 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
                    "Output(W@Grad should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")),
                    "Output(X@Grad should not be null.");
-    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
-      ctx->SetOutputDim(framework::GradVarName("Bias"),
-                        ctx->GetInputDim("Bias"));
-    }
     if (!ctx->Attrs().Get<bool>("is_sparse")) {
+      if (ctx->HasOutput(framework::GradVarName("Bias"))) {
+        ctx->SetOutputDim(framework::GradVarName("Bias"),
+                          ctx->GetInputDim("Bias"));
+      }
       ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
     }
     ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
@@ -172,20 +173,31 @@ class HierarchicalSigmoidGradOpGradVarTypeInference
  public:
   void operator()(const framework::OpDesc& op_desc,
                   framework::BlockDesc* block) const override {
-    auto out_var_name = op_desc.Output(framework::GradVarName("W")).front();
+    auto out_W_var_name = op_desc.Output(framework::GradVarName("W")).front();
+    auto out_Bias_var_name =
+        op_desc.Output(framework::GradVarName("Bias")).front();
     auto attr = op_desc.GetAttr("is_sparse");
     bool is_sparse = boost::get<bool>(attr);
     if (is_sparse) {
       VLOG(3) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
               << " is set to SelectedRows";
-      block->Var(out_var_name)
+      block->Var(out_W_var_name)
+          ->SetType(framework::proto::VarType::SELECTED_ROWS);
+      VLOG(3) << "hierarchical_sigmoid_grad op "
+              << framework::GradVarName("Bias") << " is set to SelectedRows";
+      block->Var(out_Bias_var_name)
           ->SetType(framework::proto::VarType::SELECTED_ROWS);
     } else {
       VLOG(3) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W")
               << " is set to LoDTensor";
-      block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
+      block->Var(out_W_var_name)
+          ->SetType(framework::proto::VarType::LOD_TENSOR);
+      VLOG(3) << "hierarchical_sigmoid_grad op "
+              << framework::GradVarName("Bias") << " is set to LoDTensor";
+      block->Var(out_Bias_var_name)
+          ->SetType(framework::proto::VarType::LOD_TENSOR);
     }
-    block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
+    block->Var(out_W_var_name)->SetDataType(block->Var("W")->GetDataType());
   }
 };

paddle/fluid/operators/hierarchical_sigmoid_op.h

Lines changed: 24 additions & 7 deletions
@@ -124,13 +124,12 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
     auto* w = ctx.Input<framework::LoDTensor>("W");
     auto* path = ctx.Input<framework::LoDTensor>("PTable");
     auto* code = ctx.Input<framework::LoDTensor>("PCode");
+    auto* bias = ctx.Input<framework::LoDTensor>("Bias");
     auto* in_grad =
         ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
     bool is_sparse = ctx.Attr<bool>("is_sparse");
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     math::SetConstant<DeviceContext, T> zero;
-    auto* bias_grad =
-        ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
     auto* label = ctx.Input<framework::LoDTensor>("Label");
     auto* pre_out = ctx.Input<framework::LoDTensor>("PreOut");
     auto* out_grad =
@@ -174,12 +173,15 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
         pre_out_grad_mat * out_grad_mat.broadcast(bcast);
     // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to
     // be consistent with the clipping in forward.
-    if (bias_grad) {
-      bias_grad->mutable_data<T>(ctx.GetPlace());
-      zero(dev_ctx, bias_grad, static_cast<T>(0.0));
-      bit_code->AddGrad(pre_out_grad, bias_grad);
-    }
+
     if (!is_sparse) {
+      auto* bias_grad =
+          ctx.Output<framework::LoDTensor>(framework::GradVarName("Bias"));
+      if (bias_grad) {
+        bias_grad->mutable_data<T>(ctx.GetPlace());
+        zero(dev_ctx, bias_grad, static_cast<T>(0.0));
+        bit_code->AddGrad(pre_out_grad, bias_grad);
+      }
       auto* w_grad =
           ctx.Output<framework::LoDTensor>(framework::GradVarName("W"));
       w_grad->mutable_data<T>(ctx.GetPlace());
@@ -199,6 +201,21 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {

       w_grad_value->mutable_data<T>(temp_dim, ctx.GetPlace());
       zero(dev_ctx, w_grad_value, static_cast<T>(0.0));
+      auto* bias_grad =
+          ctx.Output<framework::SelectedRows>(framework::GradVarName("Bias"));
+      if (bias_grad) {
+        bias_grad->set_rows(real_rows);
+        // build ids -> rows index map
+        bias_grad->SyncIndex();
+        bias_grad->set_height(bias->dims()[0]);
+        auto* bias_grad_value = bias_grad->mutable_value();
+        std::vector<int64_t> dims = {static_cast<int64_t>(real_rows.size()),
+                                     bias->dims()[1]};
+        bias_grad_value->mutable_data<T>(framework::make_ddim(dims),
+                                         ctx.GetPlace());
+        zero(dev_ctx, bias_grad_value, static_cast<T>(0.0));
+        bit_code->AddGrad(pre_out_grad, bias_grad);
+      }
       bit_code->MulGradWeight(pre_out_grad, w_grad, *in);
     }
     bit_code->MulGradError(pre_out_grad, *w, in_grad);
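
For intuition, the sparse branch above stores Bias@GRAD as a SelectedRows: only the bias rows actually touched by the sampled paths of the current batch are materialized. A rough NumPy sketch of that layout (illustration only, not Paddle API; the concrete values are made up):

import numpy as np

# Hypothetical example: this batch only touches non-leaf nodes 0, 2 and 5.
real_rows = [0, 2, 5]
num_classes = 8                                   # bias has shape [num_classes - 1, 1]

rows = real_rows                                  # bias_grad->set_rows(real_rows)
height = num_classes - 1                          # bias_grad->set_height(bias->dims()[0])
value = np.zeros((len(real_rows), 1), "float32")  # value tensor: [len(real_rows), bias->dims()[1]]
# AddGrad then accumulates pre_out_grad entries into the matching rows of `value`.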

paddle/fluid/operators/math/matrix_bit_code.cc

Lines changed: 18 additions & 0 deletions
@@ -48,6 +48,24 @@ void MatrixBitCodeFunctor<T>::AddGrad(const framework::LoDTensor& tmat,
   }
 }

+template <typename T>
+void MatrixBitCodeFunctor<T>::AddGrad(const framework::LoDTensor& tmat,
+                                      framework::SelectedRows* vec) {
+  size_t batch_size = tmat.dims()[0];
+  size_t width = tmat.dims()[1];
+  for (size_t i = 0; i < batch_size; ++i) {
+    auto code = code_table->get_code(i);
+    int code_length = code->get_length();
+    for (int j = 0; j < code_length; ++j) {
+      size_t index = code->calc_index(j);
+      int64_t row_index =
+          vec->AutoGrownIndex(static_cast<int64_t>(index), false, true);
+      vec->mutable_value()->data<T>()[row_index] +=
+          tmat.data<T>()[i * width + j];
+    }
+  }
+}
+
 template <typename T>
 void MatrixBitCodeFunctor<T>::Sum(const framework::LoDTensor& tmat,
                                   framework::LoDTensor* sum, T scale_sum) {
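
A plain-Python sketch of the accumulation the new AddGrad overload performs (helper names are hypothetical; `row_of_node` stands in for the ids-to-rows map that SyncIndex/AutoGrownIndex provide in the C++ code):

def add_grad_selected_rows(tmat, codes, row_of_node, bias_grad_value):
    # tmat: [batch_size, width] gradient w.r.t. pre_out
    # codes[i]: list of non-leaf node ids on sample i's path
    # bias_grad_value: [num_rows, 1] value tensor of the SelectedRows bias grad
    batch_size = len(tmat)
    for i in range(batch_size):
        for j, node in enumerate(codes[i]):
            row = row_of_node[node]          # ids -> rows lookup
            bias_grad_value[row][0] += tmat[i][j]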

paddle/fluid/operators/math/matrix_bit_code.h

Lines changed: 5 additions & 0 deletions
@@ -241,6 +241,11 @@ class MatrixBitCodeFunctor {
    */
   void AddGrad(const framework::LoDTensor& tmat, framework::LoDTensor* vec);

+  /* For selected rows, for j < code_length
+       vec(0, index(i, j)) += tmat(i, j)
+  */
+  void AddGrad(const framework::LoDTensor& tmat, framework::SelectedRows* vec);
+
   /* For j < code_length
      sum(i, 0) = \sum_j bit(i, j) * tmat(i, j)
   */

python/paddle/fluid/layers/nn.py

Lines changed: 2 additions & 2 deletions
@@ -4639,14 +4639,14 @@ def hsigmoid(input,
     if not is_costum:
         bias = helper.create_parameter(
             attr=helper.bias_attr,
-            shape=[1, num_classes - 1],
+            shape=[num_classes - 1, 1],
             is_bias=True,
             dtype=input.dtype)
         inputs['Bias'] = bias
     else:
         bias = helper.create_parameter(
             attr=helper.bias_attr,
-            shape=[1, non_leaf_num],
+            shape=[non_leaf_num, 1],
             is_bias=True,
             dtype=input.dtype)
         inputs['Bias'] = bias
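
With this change the bias parameter is created as a column vector: [num_classes - 1, 1] for the default tree, [non_leaf_num, 1] for a custom tree. A hedged usage sketch mirroring the call made in the test below; keyword names not visible in this diff (input, label, non_leaf_num) are assumptions:

cost = fluid.layers.hsigmoid(
    input=emb,             # features, e.g. an embedding lookup
    label=label,
    non_leaf_num=3,        # custom-tree mode: number of non-leaf nodes
    ptable=ptable,
    pcode=pcode,
    is_costum=True,
    bias_attr=True,        # bias now created with shape [non_leaf_num, 1]
    is_sparse=True)        # W@GRAD and Bias@GRAD become SelectedRows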

python/paddle/fluid/tests/unittests/test_hsigmoid_op.py

Lines changed: 7 additions & 10 deletions
@@ -77,7 +77,7 @@ def hsigmoid(x, w, label, bias, num_classes):
         length = code_table.get_length()
         for j in range(length):
             idx = code_table.cal_index(j)
-            pre_output[i][j] += bias[0][idx]
+            pre_output[i][j] += bias[idx][0]
     for i in range(batch_size):
         code_table = CodeTable(num_classes, label[i])
         length = code_table.get_length()
@@ -115,7 +115,7 @@ def hsigmoidWithCustomTree(x, w, ptable, pcode, label, bias, num_classes):
         length = code_table.get_length()
         for j in range(length):
             idx = code_table.cal_index(j)
-            pre_output[i][j] += bias[0][idx]
+            pre_output[i][j] += bias[idx][0]
     for i in range(batch_size):
         code_table = CodeTableWithCustomTree(ptable, pcode, i)
         length = code_table.get_length()
@@ -150,7 +150,7 @@ def setUp(self):
         w = np.random.random(
             (num_classes - 1, feature_size)).astype("float32") * 2
         label = np.random.randint(0, num_classes, (batch_size, 1))
-        bias = np.random.random((1, num_classes - 1)).astype("float32")
+        bias = np.random.random((num_classes - 1, 1)).astype("float32")
         self.attrs = {'num_classes': num_classes, 'is_sparse': False}
         self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias}
         pre_output, out = hsigmoid(x, w, label, bias, num_classes)
@@ -178,7 +178,7 @@ def setUp(self):
             -1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
         pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
             1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store
-        bias = np.random.random((1, num_classes - 1)).astype("float32")
+        bias = np.random.random((num_classes - 1, 1)).astype("float32")
         self.attrs = {'num_classes': num_classes, 'is_sparse': True}
         self.inputs = {
             'X': x,
@@ -193,7 +193,6 @@ def setUp(self):
         self.outputs = {'PreOut': pre_output, 'Out': out}

     def test_check_output(self):
-        print("checking output in CostumTree")
         self.check_output()


@@ -208,7 +207,7 @@ def hs_net_conf(self, is_sparse):

         emb = fluid.layers.embedding(
             input=input_word,
-            is_sparse=False,
+            is_sparse=is_sparse,
             size=[3, 3],
             param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal(
                 scale=1 / math.sqrt(3))))
@@ -220,6 +219,7 @@ def hs_net_conf(self, is_sparse):
             ptable=ptable,
             pcode=pcode,
             is_costum=True,
+            bias_attr=True,
             is_sparse=is_sparse)

         avg_cost = fluid.layers.reduce_mean(cost)
@@ -240,7 +240,6 @@ def training_test(self, is_sparse):
         optimizer.minimize(loss)

         main_program = fluid.default_main_program()
-        # print("main program: {program}".format{program=str(main_program)})
         place = fluid.CPUPlace()
         feeder = fluid.DataFeeder(feed_list=data_list, place=place)
         exe = fluid.Executor(place)
@@ -279,7 +278,7 @@ def setUp(self):
             -1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf)
         pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), (
             1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store
-        bias = np.random.random((1, num_classes - 1)).astype("float32")
+        bias = np.random.random((num_classes - 1, 1)).astype("float32")
         self.attrs = {'num_classes': num_classes, 'is_sparse': False}
         self.inputs = {
             'X': x,
@@ -294,11 +293,9 @@ def setUp(self):
         self.outputs = {'PreOut': pre_output, 'Out': out}

     def test_check_output(self):
-        print("checking output in CostumTree")
         self.check_output()

     def test_check_grad(self):
-        print("checking outputGrad in CostumTree")
         self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label'))
