PaddlePaddle
diff --git a/‎cmake/configure.cmake
Lines changed: 4 additions & 0 deletions b/‎cmake/configure.cmake
Lines changed: 4 additions & 0 deletions
diff --git a/‎doc/survey/dynamic_graph.md
Lines changed: 378 additions & 0 deletions b/‎doc/survey/dynamic_graph.md
Lines changed: 378 additions & 0 deletions
diff --git a/‎paddle/fluid/framework/CMakeLists.txt
Lines changed: 7 additions & 2 deletions b/‎paddle/fluid/framework/CMakeLists.txt
Lines changed: 7 additions & 2 deletions
diff --git a/‎paddle/fluid/framework/details/ssa_graph_checker.h
Lines changed: 1 addition & 1 deletion b/‎paddle/fluid/framework/details/ssa_graph_checker.h
Lines changed: 1 addition & 1 deletion
diff --git a/‎paddle/fluid/framework/executor.cc
Lines changed: 11 additions & 0 deletions b/‎paddle/fluid/framework/executor.cc
Lines changed: 11 additions & 0 deletions
diff --git a/‎paddle/fluid/framework/executor.h
Lines changed: 7 additions & 0 deletions b/‎paddle/fluid/framework/executor.h
Lines changed: 7 additions & 0 deletions
diff --git a/‎paddle/fluid/operators/detail/grpc_client.cc
Lines changed: 19 additions & 0 deletions b/‎paddle/fluid/operators/detail/grpc_client.cc
Lines changed: 19 additions & 0 deletions
diff --git a/‎paddle/fluid/operators/detail/grpc_client.h
Lines changed: 5 additions & 0 deletions b/‎paddle/fluid/operators/detail/grpc_client.h
Lines changed: 5 additions & 0 deletions
diff --git a/‎paddle/fluid/operators/detail/grpc_server.cc
Lines changed: 9 additions & 7 deletions b/‎paddle/fluid/operators/detail/grpc_server.cc
Lines changed: 9 additions & 7 deletions
diff --git a/‎paddle/fluid/operators/detail/request_handler.h
Lines changed: 14 additions & 8 deletions b/‎paddle/fluid/operators/detail/request_handler.h
Lines changed: 14 additions & 8 deletions
@@ -118,6 +118,10 @@ endif()
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
 
+if(WITH_DISTRIBUTE)
+  add_definitions(-DPADDLE_WITH_DISTRIBUTE)
+endif()
+
 if(WITH_GOLANG)
   # we need to symlink Paddle directory into GOPATH. If we
   # don't do it and we have code that depends on Paddle, go
 
@@ -83,8 +83,13 @@ cc_library(lod_rank_table SRCS lod_rank_table.cc DEPS lod_tensor)
 
 cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glog)
 
-cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
-framework_proto glog lod_rank_table feed_fetch_method)
+if(WITH_DISTRIBUTE)
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr)
+  set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
+  set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+else()
+  cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method)
+endif()
 
 
 cc_library(parallel_executor SRCS parallel_executor.cc DEPS ssa_graph_builder_factory threaded_ssa_graph_executor scope_buffered_ssa_graph_executor)
 
@@ -19,7 +19,7 @@
 namespace paddle {
 namespace framework {
 namespace details {
-class SSAGraph;
+struct SSAGraph;
 
 class SSAGraghBuilderWithChecker : public SSAGraphBuilder {
  public:
 
@@ -20,6 +20,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/reader.h"
+#ifdef PADDLE_WITH_DISTRIBUTE
+#include "paddle/fluid/operators/detail/grpc_client.h"
+#endif
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -44,6 +47,14 @@ ExecutorPrepareContext::~ExecutorPrepareContext() {
 
 Executor::Executor(const platform::Place& place) : place_(place) {}
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+void Executor::Complete() {
+  ::paddle::operators::detail::RPCClient::GetInstance<
+      ::paddle::operators::detail::GRPCClient>()
+      ->SendComplete();
+}
+#endif
+
 void InitializeVariable(Variable* var, proto::VarType::Type var_type) {
   if (var_type == proto::VarType::LOD_TENSOR) {
     var->GetMutable<LoDTensor>();
 
@@ -44,6 +44,13 @@ class Executor {
 
   explicit Executor(const platform::Place& place);
 
+#ifdef PADDLE_WITH_DISTRIBUTE
+  /*
+   * Sending signal to pserver to mark current trainer stop.
+   */
+  void Complete();
+#endif
+
   /* @Brief
    * Runtime evaluation of the given ProgramDesc under certain Scope
    *
 
@@ -34,6 +34,12 @@ void GRPCClient::InitEventLoop() {
   client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this)));
 }
 
+void GRPCClient::SendComplete() {
+  for (auto& it : channels_) {
+    this->AsyncSendComplete(it.first);
+  }
+}
+
 GRPCClient::~GRPCClient() {
   Wait();
   cq_.Shutdown();
@@ -210,6 +216,19 @@ void GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
   req_count_++;
 }
 
+void GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) {
+  const auto ch = GetChannel(ep);
+
+  BatchBarrierProcessor* s = new BatchBarrierProcessor(ch);
+  s->Prepare(time_out);
+
+  sendrecv::VariableMessage req;
+  req.set_varname(COMPLETE_MESSAGE);
+  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
+  req_count_++;
+}
+
 void GRPCClient::Wait() {
   std::unique_lock<std::mutex> lk(sync_mutex_);
   sync_cond_.wait(lk, [this] { return req_count_ == 0; });
 
@@ -195,6 +195,8 @@ class GRPCClient : public RPCClient {
 
   void Wait() override;
 
+  void SendComplete() override;
+
  protected:
   void InitImpl() override;
 
@@ -204,6 +206,9 @@ class GRPCClient : public RPCClient {
 
   void Proceed();
 
+  void AsyncSendComplete(const std::string& ep,
+                         int64_t time_out = RPCClient::rpc_time_out);
+
   std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
 
  private:
 
@@ -162,16 +162,18 @@ class RequestPrefetch final : public RequestBase {
 
   void Process() override {
     // prefetch process...
-    std::string varname = request_->OutVarname();
-    VLOG(3) << "RequestPrefetch " << varname;
+    std::string in_var_name = request_->Varname();
+    std::string out_var_name = request_->OutVarname();
+    VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name
+            << " out_var_name: " << out_var_name;
 
     auto scope = request_->GetMutableLocalScope();
-    auto invar = scope->FindVar(varname);
-    framework::Variable* outvar = nullptr;
+    auto invar = scope->FindVar(in_var_name);
+    framework::Variable* outvar = scope->FindVar(out_var_name);
 
-    request_handler_->Handle(varname, scope, invar, &outvar);
+    request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name);
 
-    SerializeToByteBuffer(varname, outvar, *request_handler_->dev_ctx(),
+    SerializeToByteBuffer(out_var_name, outvar, *request_handler_->dev_ctx(),
                           &reply_);
     Finish(reply_, &responder_);
   }
@@ -287,7 +289,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name,
   } else if (rpc_name == kRequestPrefetch) {
     b = new RequestPrefetch(&service_, cq.get(), handler, req_id);
   } else {
-    PADDLE_ENFORCE(false, "not surpported rpc");
+    PADDLE_ENFORCE(false, "not supported rpc");
   }
 
   reqs[req_id] = b;
 
@@ -40,6 +40,7 @@ constexpr char kRequestPrefetch[] = "RequestPrefetch";
 #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
 #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
 #define FETCH_BARRIER_MESSAGE "FETCH_BARRIER@RECV"
+#define COMPLETE_MESSAGE "COMPLETE@RECV"
 
 class RPCServer;
 
@@ -60,9 +61,12 @@ class RequestHandler {
   void SetDevCtx(const platform::DeviceContext* dev_ctx) { dev_ctx_ = dev_ctx; }
   void SetProgram(framework::ProgramDesc* program) { program_ = program; }
   void SetExecutor(framework::Executor* executor) { executor_ = executor; }
+
+  // Used for dist lookup table prefetch
   void SetPrefetchPreparedCtx(
-      std::unique_ptr<framework::ExecutorPrepareContext> prepared) {
-    prefetch_ctx_.reset(prepared.release());
+      std::unordered_map<
+          std::string, std::shared_ptr<framework::ExecutorPrepareContext>>* g) {
+    prefetch_var_name_to_prepared_ctx_ = g;
   }
 
   // Used for async.
@@ -78,9 +82,6 @@ class RequestHandler {
   bool sync_mode() { return sync_mode_; }
   framework::Scope* scope() { return scope_; }
   const platform::DeviceContext* dev_ctx() { return dev_ctx_; }
-  framework::ExecutorPrepareContext* prefetch_ctx() {
-    return prefetch_ctx_.get();
-  }
   framework::ProgramDesc* program() { return program_; }
   framework::Executor* executor() { return executor_; }
 
@@ -99,8 +100,8 @@ class RequestHandler {
   //           *request_handler_->dev_ctx(), &reply_);
   //    }
   virtual bool Handle(const std::string& varname, framework::Scope* scope,
-                      framework::Variable* var,
-                      framework::Variable** outvar) = 0;
+                      framework::Variable* var, framework::Variable** outvar,
+                      const std::string& out_var_name = "") = 0;
 
  protected:
   const bool sync_mode_;
@@ -109,12 +110,17 @@ class RequestHandler {
   framework::Executor* executor_;
   framework::Scope* scope_;
   framework::ProgramDesc* program_;
-  std::unique_ptr<framework::ExecutorPrepareContext> prefetch_ctx_;
+
+  // used for distribute lookup table prefetch
+  std::unordered_map<std::string,
+                     std::shared_ptr<framework::ExecutorPrepareContext>>*
+      prefetch_var_name_to_prepared_ctx_;
 
   // Used for async.
   std::unordered_map<std::string,
                      std::shared_ptr<framework::ExecutorPrepareContext>>*
       grad_to_prepared_ctx_;
+
   RPCServer* rpc_server_;
 };