
Commit 4ff1bde

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/hide_api_cont
2 parents: 4dccb58 + ebe3b5e

38 files changed: +2773 additions, −825 deletions

paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc

Lines changed: 13 additions & 2 deletions
@@ -13,6 +13,7 @@
 // limitations under the License.

 #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h"
+#include <stdexcept>
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
@@ -53,8 +54,14 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
       }
     }
   }
+  std::vector<framework::LoDTensor> fetch_data;
+  std::exception_ptr eptr;
+  try {
+    fetch_data = underlying_executor_->Run(fetch_tensors);
+  } catch (...) {
+    eptr = std::current_exception();
+  }

-  auto fetch_data = underlying_executor_->Run(fetch_tensors);
   drop_scope_counter_ += 1;
   if (!fetch_tensors.empty() ||
       drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
@@ -69,7 +76,11 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
       scope->DeleteScope(local_scope);
     }
   }
-  return fetch_data;
+  if (eptr) {
+    std::rethrow_exception(eptr);
+  } else {
+    return fetch_data;
+  }
 }
 }  // namespace details
 }  // namespace framework
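
Note: the change above defers any exception thrown by the underlying executor so that the scope bookkeeping (the drop_scope_counter_ update and scope deletion) still runs before the error is surfaced. Below is a minimal standalone sketch of that capture/cleanup/rethrow pattern; RunWork and Cleanup are hypothetical stand-ins for underlying_executor_->Run() and the scope maintenance, not Paddle APIs:

#include <exception>
#include <iostream>
#include <stdexcept>

// Hypothetical work that may throw (stands in for underlying_executor_->Run()).
int RunWork(bool fail) {
  if (fail) throw std::runtime_error("worker failed");
  return 42;
}

// Hypothetical cleanup that must always run (stands in for scope deletion).
void Cleanup() { std::cout << "cleanup ran\n"; }

int Run(bool fail) {
  int result = 0;
  std::exception_ptr eptr;
  try {
    result = RunWork(fail);
  } catch (...) {
    eptr = std::current_exception();  // remember the error, keep going
  }
  Cleanup();  // always executed, whether or not RunWork threw
  if (eptr) std::rethrow_exception(eptr);  // surface the original exception
  return result;
}

int main() {
  std::cout << Run(false) << "\n";  // prints "cleanup ran", then 42
  try {
    Run(true);
  } catch (const std::exception &e) {
    std::cout << "caught: " << e.what() << "\n";  // original type and message
  }
  return 0;
}

Because std::exception_ptr preserves the original exception object, the caller still sees exactly the type and message the worker threw.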

paddle/fluid/framework/details/threaded_ssa_graph_executor.cc

Lines changed: 11 additions & 4 deletions
@@ -78,6 +78,10 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     set.clear();
   };

+  // Clean run context
+  run_op_futures_.clear();
+  exception_.reset();
+
   // Step 3. Execution
   while (!pending_vars.empty()) {
     // 1. Run All Ready ops
@@ -96,16 +100,19 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
     auto cur_ready_vars = ready_vars.PopAll(1, &timeout);

     if (timeout) {
-      std::lock_guard<std::mutex> l(exception_mu_);
+      std::unique_lock<std::mutex> l(exception_mu_);
       if (exception_) {
+        l.unlock();
+        for (auto &run_op_future : run_op_futures_) {
+          run_op_future.wait();
+        }
+        l.lock();
         std::exception *exp = exception_.get();
         if (dynamic_cast<platform::EOFException *>(exp)) {
           auto e = *static_cast<platform::EOFException *>(exp);
-          exception_.reset();
           throw e;
         } else if (dynamic_cast<platform::EnforceNotMet *>(exp)) {
           auto e = *static_cast<platform::EnforceNotMet *>(exp);
-          exception_.reset();
           throw e;
         } else {
           LOG(FATAL) << "Unknown exception.";
@@ -222,7 +229,7 @@ void ThreadedSSAGraphExecutor::RunOp(
     }
   };
   if (pool_) {
-    pool_->enqueue(op_run);
+    run_op_futures_.emplace_back(pool_->enqueue(op_run));
   } else {
     op_run();
   }
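
Note: RunOp now records a future for every operator handed to the thread pool, and Run waits for all in-flight operators before rethrowing a captured exception, so no worker is still running when the error propagates (exception_ is also no longer reset in the catch branches, since the run context is cleared at the start of the next Run). A hedged sketch of the record-futures / wait-before-rethrow idea, using std::async in place of the executor's thread pool:

#include <exception>
#include <future>
#include <iostream>
#include <list>
#include <mutex>
#include <stdexcept>

int main() {
  std::list<std::future<void>> run_op_futures;  // one future per submitted op
  std::mutex exception_mu;
  std::exception_ptr captured;

  // Submit a few "ops"; one of them fails and records its exception.
  for (int i = 0; i < 4; ++i) {
    run_op_futures.emplace_back(std::async(std::launch::async, [&, i] {
      if (i == 2) {
        std::lock_guard<std::mutex> guard(exception_mu);
        captured = std::make_exception_ptr(std::runtime_error("op 2 failed"));
      }
    }));
  }

  // Before surfacing the error, wait until every in-flight op has finished.
  for (auto &f : run_op_futures) f.wait();

  std::lock_guard<std::mutex> guard(exception_mu);
  if (captured) {
    try {
      std::rethrow_exception(captured);
    } catch (const std::exception &e) {
      std::cout << "caught after all ops finished: " << e.what() << "\n";
    }
  }
  return 0;
}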

paddle/fluid/framework/details/threaded_ssa_graph_executor.h

Lines changed: 3 additions & 0 deletions
@@ -15,6 +15,7 @@
 #pragma once

 #include <deque>
+#include <list>
 #include <string>
 #include <unordered_set>
 #include <utility>
@@ -77,6 +78,8 @@ class ThreadedSSAGraphExecutor : public SSAGraphExecutor {

  private:
   ExecutionStrategy strategy_;
+  // use std::list because clear(), push_back, and for_each are O(1)
+  std::list<std::future<void>> run_op_futures_;
 };

 }  // namespace details

paddle/fluid/framework/parallel_executor.cc

Lines changed: 7 additions & 3 deletions
@@ -95,7 +95,7 @@ ParallelExecutor::ParallelExecutor(
   }

   if (member_->local_scopes_.size() != 1 && local_scopes.empty()) {
-    BCastParamsToGPUs(bcast_vars);
+    BCastParamsToDevs(bcast_vars);
   }
   // Startup Program has been run. All local scopes has correct parameters.

@@ -131,7 +131,7 @@ ParallelExecutor::ParallelExecutor(
       member_->places_, std::move(member_->executor_)));
 }

-void ParallelExecutor::BCastParamsToGPUs(
+void ParallelExecutor::BCastParamsToDevs(
     const std::unordered_set<std::string> &vars) const {
   // the the initializing bcast, all vars would be bcast from device(0),
   // otherwise
@@ -202,7 +202,11 @@ void ParallelExecutor::BCastParamsToGPUs(
 #endif
     } else {
       platform::CPUPlace cpu;
-      for (size_t i = 1; i < member_->places_.size(); ++i) {
+      for (size_t i = 0; i < member_->places_.size(); ++i) {
+        if ((initializing && i == 0) ||
+            (!initializing && static_cast<int>(i) == var_dev_id))
+          continue;
+
         auto local_scope = member_->local_scopes_[i];
         auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
         t->Resize(dims);
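
Note: the loop fix broadcasts a parameter to every device except the one that already holds the source copy: device 0 while initializing, otherwise the device recorded in var_dev_id. A small sketch of just that skip logic; Broadcast and CopyTo are illustrative placeholders, not ParallelExecutor's API:

#include <iostream>

// Hypothetical stand-in for copying the tensor to device i.
void CopyTo(size_t i) { std::cout << "  broadcast to device " << i << "\n"; }

void Broadcast(size_t num_devices, bool initializing, int var_dev_id) {
  for (size_t i = 0; i < num_devices; ++i) {
    // Skip the device that already owns the up-to-date parameter.
    if ((initializing && i == 0) ||
        (!initializing && static_cast<int>(i) == var_dev_id)) {
      continue;
    }
    CopyTo(i);
  }
}

int main() {
  std::cout << "initializing:\n";
  Broadcast(4, /*initializing=*/true, /*var_dev_id=*/-1);   // skips device 0
  std::cout << "var already on device 2:\n";
  Broadcast(4, /*initializing=*/false, /*var_dev_id=*/2);   // skips device 2
  return 0;
}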

paddle/fluid/framework/parallel_executor.h

Lines changed: 1 addition & 1 deletion
@@ -66,7 +66,7 @@ class ParallelExecutor {
   void Run(const std::vector<std::string> &fetch_tensors,
            const std::string &fetched_var_name);

-  void BCastParamsToGPUs(const std::unordered_set<std::string> &vars) const;
+  void BCastParamsToDevs(const std::unordered_set<std::string> &vars) const;

  private:
   ParallelExecutorPrivate *member_;

paddle/fluid/framework/reader.h

Lines changed: 4 additions & 4 deletions
@@ -29,11 +29,11 @@ enum ReaderStatus { kRunning, kStopped };

 class ReaderBase {
  public:
-  void ReadNext(std::vector<LoDTensor>* out);
+  virtual void ReadNext(std::vector<LoDTensor>* out);

-  void Shutdown();
+  virtual void Shutdown();

-  void Start();
+  virtual void Start();

   // Return the readers which are the end of decorating chain. Basically
   // they are readers just before read op.
@@ -42,7 +42,7 @@ class ReaderBase {
   virtual ~ReaderBase();

  protected:
-  virtual void ReadNextImpl(std::vector<LoDTensor>* out) = 0;
+  virtual void ReadNextImpl(std::vector<LoDTensor>* out) {}

   virtual void ShutdownImpl() {}
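
Note: making ReadNext, Shutdown, and Start virtual lets decorating readers override the public entry points directly, and the empty ReadNextImpl default means such a decorator no longer has to implement the protected hook. A minimal sketch of that shape; Batch, RangeReader, and CountingReader are illustrative simplifications, not Paddle types:

#include <iostream>
#include <vector>

using Batch = std::vector<int>;  // simplified stand-in for std::vector<LoDTensor>

class ReaderBase {
 public:
  virtual void ReadNext(Batch* out) { ReadNextImpl(out); }
  virtual void Shutdown() {}
  virtual void Start() {}
  virtual ~ReaderBase() = default;

 protected:
  // Non-pure default: a reader that overrides ReadNext() need not implement it.
  virtual void ReadNextImpl(Batch* out) {}
};

// A plain reader implements only the protected hook.
class RangeReader : public ReaderBase {
 protected:
  void ReadNextImpl(Batch* out) override { *out = {next_++, next_++}; }

 private:
  int next_ = 0;
};

// A decorating reader overrides the public entry point and forwards.
class CountingReader : public ReaderBase {
 public:
  explicit CountingReader(ReaderBase* underlying) : underlying_(underlying) {}
  void ReadNext(Batch* out) override {
    underlying_->ReadNext(out);
    ++batches_read_;
  }
  int batches_read() const { return batches_read_; }

 private:
  ReaderBase* underlying_;
  int batches_read_ = 0;
};

int main() {
  RangeReader base;
  CountingReader reader(&base);
  Batch batch;
  reader.ReadNext(&batch);
  std::cout << "items: " << batch.size()
            << ", batches read: " << reader.batches_read() << "\n";
  return 0;
}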

paddle/fluid/operators/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
@@ -259,12 +259,15 @@ op_library(max_sequence_len_op DEPS lod_rank_table)
 op_library(sequence_conv_op DEPS context_project)
 op_library(sequence_pool_op DEPS sequence_pooling)
 op_library(lstm_op DEPS sequence2batch lstm_compute)
+op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
 op_library(lstmp_op DEPS sequence2batch lstm_compute)
 op_library(gru_op DEPS sequence2batch gru_compute)
 op_library(recurrent_op DEPS executor)
 op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(parallel_do_op DEPS executor)
+op_library(unsqueeze_op DEPS reshape_op)
+op_library(squeeze_op DEPS reshape_op)

 if (WITH_GPU)
     op_library(conv_op DEPS vol2col depthwise_conv im2col)
paddle/fluid/operators/hierarchical_sigmoid_op.cc (new file)

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/operators/hierarchical_sigmoid_op.h"
#include <vector>

namespace paddle {
namespace operators {

/**
 * Organize the classes into a binary tree. At each node, a sigmoid function
 * is used to calculate the probability of belonging to the right branch.
 * This idea is from "F. Morin, Y. Bengio (AISTATS 05):
 * Hierarchical Probabilistic Neural Network Language Model."
 *
 * Here we uses a simple way of making the binary tree.
 * Assuming the number of classes C = 6,
 * The classes are organized as a binary tree in the following way:
 *
 * @code{.py}
 * *-*-*- 2
 * | | |- 3
 * | |
 * | |-*- 4
 * |   |- 5
 * |
 * |-*- 0
 *   |- 1
 * @endcode
 *
 * where * indicates an internal node, and each leaf node represents a class.
 * - Node 0 ... C-2 are internal nodes.
 * - Node C-1 ... 2C-2 are leaf nodes.
 * - Class c is represented by leaf node \f$c+C-1\f$.
 *
 * We assign an id for each node:
 * - the id of root be 0.
 * - the left child of a node i is 2*i+1.
 * - the right child of a node i is 2*i+2.
 *
 * It's easy to see that:
 * - the parent of node i is \f$\left\lfloor(i-1)/2\right\rfloor\f$.
 * - the j-th level ancestor of node i is
 *   \f$\left\lfloor(i+1)/2^{j+1}\right\rfloor - 1\f$.
 * - A node i is a left child of its parent if \f$(i-1)\%2==0\f$.
 *
 */

class HierarchicalSigmoidOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("PreOut"),
                   "Output(PreOut) should not be null.");
    const int64_t batch_size = ctx->GetInputDim("X")[0];
    std::vector<int64_t> output_shape({batch_size, 1});
    ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
        ctx.GetPlace());
  }
};

template <typename AttrType>
class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
             "(Tensor, required) The input tensor with shape [N, D], "
             "where N is the size of mini-batch, and D is the feature size.");
    AddInput("W",
             "(Tensor, required), The parameters of hierarchical "
             "sigmoid operator, each of them is a 2-D tensor, the shape is"
             "[num_classes - 1, D].");
    AddInput("Label",
             "(Tensor, required), The labels of training data. It's a"
             "tensor with shape [N, 1].");
    AddInput("Bias",
             "(Tensor, optional), The bias is a tensor with shape"
             "[1, num_classes - 1].");
    AddOutput("Out",
              "(Tensor, required) The output of hierarchical sigmoid operator."
              "The shape is [N, 1].");
    AddOutput("PreOut",
              "(Tensor, required) A intermedia 2-D tensor with shape "
              "[batch_size, code_length], where code_length represents the "
              "maximum path length from root to leaf nodes.")
        .AsIntermediate();
    AddAttr<AttrType>("num_classes", "(int, required), The number of classes")
        .SetDefault(2);
    AddComment(R"DOC(
The hierarchical sigmoid operator organize the classes into a binary tree.
At each node, a sigmoid function is used to calculate the probability of
belonging to the right branch. This idea is from
"F. Morin, Y. Bengio (AISTATS 05):
Hierarchical Probabilistic Neural Network Language Model."
)DOC");
  }
};

class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null.");
    PADDLE_ENFORCE(ctx->HasInput("PreOut"),
                   "Input(Preout) should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("W")),
                   "Output(W@Grad should not be null.)");
    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")));
    if (ctx->HasOutput(framework::GradVarName("Bias"))) {
      ctx->SetOutputDim(framework::GradVarName("Bias"),
                        ctx->GetInputDim("Bias"));
    }
    ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W"));
    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
        ctx.GetPlace());
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp,
                  ops::HierarchicalSigmoidOpMaker<int>,
                  paddle::framework::DefaultGradOpDescMaker<true>);
REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp);
REGISTER_OP_CPU_KERNEL(
    hierarchical_sigmoid,
    ops::HierarchicalSigmoidOpKernel<paddle::platform::CPUDeviceContext, float>,
    ops::HierarchicalSigmoidOpKernel<paddle::platform::CPUDeviceContext,
                                     double>);
REGISTER_OP_CPU_KERNEL(
    hierarchical_sigmoid_grad,
    ops::HierarchicalSigmoidGradOpKernel<paddle::platform::CPUDeviceContext,
                                         float>,
    ops::HierarchicalSigmoidGradOpKernel<paddle::platform::CPUDeviceContext,
                                         double>);
