
Commit 40c631e

Author: yi.wu
Commit message: Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_unitests
2 parents: 44925eb + 0b3d7f1

11 files changed: 55 additions, 15 deletions


cmake/external/openblas.cmake

Lines changed: 2 additions & 0 deletions
@@ -29,6 +29,8 @@ IF(NOT ${CBLAS_FOUND})
       "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
       CACHE FILEPATH "openblas library." FORCE)
 
+  ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
+
   SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
   SET(OPENBLAS_COMMIT "v0.2.20")
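This build-level definition is what selects the #ifdef PADDLE_USE_OPENBLAS branches that guard the OpenBLAS headers in blas.h, math_function.h, and MathFunctions.h below.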

paddle/contrib/tape/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ if(APPLE)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
 endif(APPLE)
 
-cc_library(tape_variable SRCS variable.cc DEPS ${FLUID_CORE_MODULES})
+cc_library(tape_variable SRCS variable.cc DEPS ${FLUID_CORE_MODULES} device_context)
 cc_library(tape SRCS tape.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} tape_variable)
 
 cc_test(test_tape

paddle/fluid/framework/parallel_executor.cc

Lines changed: 15 additions & 5 deletions
@@ -145,9 +145,9 @@ void ParallelExecutor::BCastParamsToGPUs(
     auto &dims = main_tensor.dims();
     if (paddle::platform::is_gpu_place(main_tensor.place())) {
 #ifdef PADDLE_WITH_CUDA
+      std::vector<void *> buffers;
       size_t numel = main_tensor.numel();
       ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
-      platform::NCCLGroupGuard guard;
       for (size_t i = 0; i < member_->places_.size(); ++i) {
         auto place = member_->places_[i];
         void *buffer;
@@ -159,11 +159,21 @@
           t->Resize(dims);
           buffer = t->mutable_data(place, main_tensor.type());
         }
-        auto &nccl_ctx = member_->nccl_ctxs_->at(place);
-        platform::dynload::ncclBcast(buffer, numel, data_type, 0,
-                                     nccl_ctx.comm_, nccl_ctx.stream());
+        buffers.push_back(buffer);
       }
-      member_->nccl_ctxs_->WaitAll();
+
+      PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
+                        "variables' buffer size to bcast NOT equal to places");
+      {
+        platform::NCCLGroupGuard guard;
+        for (size_t i = 0; i < member_->places_.size(); ++i) {
+          auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
+          platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
+                                       nccl_ctx.comm_, nccl_ctx.stream());
+        }
+        member_->nccl_ctxs_->WaitAll();
+      }
+
 #else
     PADDLE_THROW("Not compiled with CUDA");
 #endif
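This is the substance of the fix on the GPU broadcast path: buffer preparation for every place now finishes before any NCCL group is opened, and NCCLGroupGuard wraps only the ncclBcast loop plus WaitAll. A minimal standalone sketch of the resulting two-phase pattern, written against the raw NCCL API rather than Paddle's wrappers (function names and setup here are illustrative, not from the commit):

#include <cuda_runtime.h>
#include <nccl.h>

#include <vector>

// Phase 2 of the pattern: every buffers[i] was already allocated for device i
// (mirroring the buffers.push_back(buffer) loop in the diff), so nothing that
// can throw runs between ncclGroupStart and ncclGroupEnd.
void BcastToAllPlaces(const std::vector<ncclComm_t>& comms,
                      const std::vector<cudaStream_t>& streams,
                      const std::vector<void*>& buffers,
                      size_t numel, ncclDataType_t dtype) {
  ncclGroupStart();
  for (size_t i = 0; i < comms.size(); ++i) {
    // Device 0 holds the source copy; every other device receives into
    // its own buffers[i].
    ncclBcast(buffers[i], numel, dtype, /*root=*/0, comms[i], streams[i]);
  }
  ncclGroupEnd();
  // Analogue of member_->nccl_ctxs_->WaitAll() in the diff.
  for (cudaStream_t s : streams) cudaStreamSynchronize(s);
}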

paddle/fluid/inference/io.cc

Lines changed: 4 additions & 0 deletions
@@ -20,16 +20,20 @@ limitations under the License. */
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/pybind/pybind.h"
 
 DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
 DEFINE_bool(init_p2p, false, "Whether to init p2p.");
+DEFINE_int32(math_num_threads, 1,
+             "Number of threads used to run math functions.");
 
 namespace paddle {
 namespace inference {
 
 void Init(const std::vector<std::string> argv) {
   framework::InitGflags(argv);
+  operators::math::SetNumThreads(FLAGS_math_num_threads);
   // init devices
   std::vector<int> devices;
   std::string token;
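Since Init forwards argv to InitGflags, the new flag should be settable alongside the existing --devices and --init_p2p flags; the exact argv conventions and the value below are assumptions, not part of the commit:

#include "paddle/fluid/inference/io.h"

int main() {
  // Hypothetical call site: cap math-library threads for inference at 4.
  paddle::inference::Init({"--math_num_threads=4"});
  // ... load the program desc and run inference ...
  return 0;
}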

paddle/fluid/operators/detail/grpc_server.cc

Lines changed: 2 additions & 1 deletion
@@ -169,7 +169,8 @@ class RequestPrefetch final : public RequestBase {
 
     auto scope = request_->GetMutableLocalScope();
     auto invar = scope->FindVar(in_var_name);
-    framework::Variable* outvar = scope->FindVar(out_var_name);
+    // out var must be created in local scope!
+    framework::Variable* outvar = scope->Var(out_var_name);
 
     request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name);
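The fix relies on the difference between a pure lookup and create-on-demand: FindVar can return nullptr when the prefetch output variable does not yet exist in the request-local scope, while Var creates it on first use. A toy model of those assumed Scope semantics:

#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>

struct Variable {};

class Scope {
 public:
  // Pure lookup: returns nullptr when the variable is absent.
  Variable* FindVar(const std::string& name) const {
    auto it = vars_.find(name);
    return it == vars_.end() ? nullptr : it->second.get();
  }
  // Create-if-missing: always returns a usable variable.
  Variable* Var(const std::string& name) {
    auto& slot = vars_[name];
    if (!slot) slot = std::make_unique<Variable>();
    return slot.get();
  }

 private:
  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
};

int main() {
  Scope local;
  std::cout << (local.FindVar("out") == nullptr) << "\n";  // 1: absent
  std::cout << (local.Var("out") != nullptr) << "\n";      // 1: created
}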

paddle/fluid/operators/math/blas.h

Lines changed: 15 additions & 0 deletions
@@ -20,13 +20,16 @@
 #ifdef PADDLE_WITH_MKLML
 #include <mkl_cblas.h>
 #include <mkl_lapacke.h>
+#include <mkl_service.h>
 #include <mkl_vml_functions.h>
 #endif
 
 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+#ifdef LAPACK_FOUND
 #include <lapacke.h>
 #endif
+#endif
 
 #ifndef LAPACK_FOUND
 extern "C" {
@@ -46,6 +49,18 @@ namespace paddle {
 namespace operators {
 namespace math {
 
+static void SetNumThreads(int num_threads) {
+#ifdef PADDLE_USE_OPENBLAS
+  int real_num_threads = num_threads > 1 ? num_threads : 1;
+  openblas_set_num_threads(real_num_threads);
+#elif defined(PADDLE_WITH_MKLML)
+  int real_num_threads = num_threads > 1 ? num_threads : 1;
+  mkl_set_num_threads(real_num_threads);
+#else
+  PADDLE_ENFORCE(false, "To be implemented.");
+#endif
+}
+
 /**
  * Matrix Descriptor of a memory buffer.
  *
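A hedged call-site sketch of the new helper: SetNumThreads clamps values below 1 to a single thread, dispatches to openblas_set_num_threads or mkl_set_num_threads depending on the build macros above, and deliberately fails via PADDLE_ENFORCE when neither backend is compiled in.

#include "paddle/fluid/operators/math/blas.h"

int main() {
  namespace math = paddle::operators::math;
  math::SetNumThreads(0);  // clamped: runs single-threaded
  math::SetNumThreads(8);  // 8 OpenBLAS/MKL worker threads
  return 0;
}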

paddle/fluid/operators/math/math_function.h

Lines changed: 2 additions & 0 deletions
@@ -21,8 +21,10 @@ limitations under the License. */
 
 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+#ifdef LAPACK_FOUND
 #include <lapacke.h>
 #endif
+#endif
 
 #ifndef LAPACK_FOUND
 extern "C" {

paddle/fluid/platform/nccl_helper.h

Lines changed: 5 additions & 0 deletions
@@ -41,6 +41,11 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
   }
 }
 
+// NOTE(minqiyang): according to the ncclGroupEnd documentations:
+// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
+// ncclGroupEnd will wait for all communicators to be initialized, which will
+// cause blocking problem when a runtime_error was thrown, so try only guard
+// NCCL actions when use it.
 class NCCLGroupGuard {
  public:
   static std::mutex &NCCLMutex() {
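The guard's body lies outside this hunk, but the note implies an RAII shape whose destructor calls ncclGroupEnd: if an exception unwinds past the guard while some communicator is still uninitialized, the destructor blocks. A sketch of that shape, with names and details illustrative rather than Paddle's exact code:

#include <mutex>
#include <nccl.h>

// Illustrative RAII guard: open an NCCL group under a global mutex, close it
// on scope exit. If a runtime_error is thrown inside the guarded scope,
// ~GroupGuard still runs ncclGroupEnd, which waits for every communicator to
// be initialized; hence the advice to keep only NCCL calls inside the guard.
class GroupGuard {
 public:
  GroupGuard() {
    mutex().lock();
    ncclGroupStart();
  }
  ~GroupGuard() {
    ncclGroupEnd();  // may block until all communicators are initialized
    mutex().unlock();
  }

 private:
  static std::mutex& mutex() {
    static std::mutex m;
    return m;
  }
};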

paddle/math/MathFunctions.h

Lines changed: 4 additions & 5 deletions
@@ -12,16 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef MATHFUNCTIONS_H_
-#define MATHFUNCTIONS_H_
+#pragma once
 
 #ifdef PADDLE_WITH_MKLML
 #include <mkl_cblas.h>
 #include <mkl_lapacke.h>
 #include <mkl_vml_functions.h>
 #endif
 
-#if defined(PADDLE_USE_VECLIB)
+#ifdef PADDLE_USE_VECLIB
 extern "C" {
 #include <cblas.h>
 #include <clapack.h>
@@ -30,8 +29,10 @@ extern "C" {
 
 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+#ifdef LAPACK_FOUND
 #include <lapacke.h>
 #endif
+#endif
 
 #ifndef LAPACK_FOUND
 extern "C" {
@@ -126,5 +127,3 @@ template <class T>
 void vTanh(const int n, const T* a, T* r);
 
 }  // namespace paddle
-
-#endif  // MATHFUNCTIONS_H_

python/paddle/fluid/layers/nn.py

Lines changed: 1 addition & 1 deletion
@@ -4242,7 +4242,7 @@ def random_crop(x, shape, seed=None):
     seed_out = helper.create_tmp_variable(dtype="int64")
     helper.append_op(
         type="random_crop",
-        inputs={"X": input,
+        inputs={"X": x,
                 "Seed": seed},
         outputs={"Out": out,
                  "SeedOut": seed_out},
