
Commit 40c631e

Author: yi.wu
Commit message: Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_unitests
2 parents: 44925eb + 0b3d7f1

11 files changed: 55 additions, 15 deletions


cmake/external/openblas.cmake

Lines changed: 2 additions & 0 deletions
@@ -29,6 +29,8 @@ IF(NOT ${CBLAS_FOUND})
       "${CBLAS_INSTALL_DIR}/lib/${CMAKE_STATIC_LIBRARY_PREFIX}openblas${CMAKE_STATIC_LIBRARY_SUFFIX}"
       CACHE FILEPATH "openblas library." FORCE)
 
+  ADD_DEFINITIONS(-DPADDLE_USE_OPENBLAS)
+
   SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
   SET(OPENBLAS_COMMIT "v0.2.20")
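This build-level definition is what selects the #ifdef PADDLE_USE_OPENBLAS branches that guard the OpenBLAS headers in blas.h, math_function.h, and MathFunctions.h below.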

paddle/contrib/tape/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
@@ -17,7 +17,7 @@ if(APPLE)
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wno-error=pessimizing-move")
 endif(APPLE)
 
-cc_library(tape_variable SRCS variable.cc DEPS ${FLUID_CORE_MODULES})
+cc_library(tape_variable SRCS variable.cc DEPS ${FLUID_CORE_MODULES} device_context)
 cc_library(tape SRCS tape.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB} tape_variable)
 
 cc_test(test_tape

paddle/fluid/framework/parallel_executor.cc

Lines changed: 15 additions & 5 deletions
@@ -145,9 +145,9 @@ void ParallelExecutor::BCastParamsToGPUs(
     auto &dims = main_tensor.dims();
     if (paddle::platform::is_gpu_place(main_tensor.place())) {
 #ifdef PADDLE_WITH_CUDA
+      std::vector<void *> buffers;
       size_t numel = main_tensor.numel();
       ncclDataType_t data_type = platform::ToNCCLDataType(main_tensor.type());
-      platform::NCCLGroupGuard guard;
       for (size_t i = 0; i < member_->places_.size(); ++i) {
         auto place = member_->places_[i];
         void *buffer;
@@ -159,11 +159,21 @@
           t->Resize(dims);
           buffer = t->mutable_data(place, main_tensor.type());
         }
-        auto &nccl_ctx = member_->nccl_ctxs_->at(place);
-        platform::dynload::ncclBcast(buffer, numel, data_type, 0,
-                                     nccl_ctx.comm_, nccl_ctx.stream());
+        buffers.push_back(buffer);
       }
-      member_->nccl_ctxs_->WaitAll();
+
+      PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
+                        "variables' buffer size to bcast NOT equal to places");
+      {
+        platform::NCCLGroupGuard guard;
+        for (size_t i = 0; i < member_->places_.size(); ++i) {
+          auto &nccl_ctx = member_->nccl_ctxs_->at(member_->places_[i]);
+          platform::dynload::ncclBcast(buffers[i], numel, data_type, 0,
+                                       nccl_ctx.comm_, nccl_ctx.stream());
+        }
+        member_->nccl_ctxs_->WaitAll();
+      }
+
 #else
     PADDLE_THROW("Not compiled with CUDA");
 #endif
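This is the substance of the fix on the GPU broadcast path: buffer preparation for every place now finishes before any NCCL group is opened, and NCCLGroupGuard wraps only the ncclBcast loop plus WaitAll. A minimal standalone sketch of the resulting two-phase pattern, written against the raw NCCL API rather than Paddle's wrappers (function names and setup here are illustrative, not from the commit):

#include <cuda_runtime.h>
#include <nccl.h>

#include <vector>

// Phase 2 of the pattern: every buffers[i] was already allocated for device i
// (mirroring the buffers.push_back(buffer) loop in the diff), so nothing that
// can throw runs between ncclGroupStart and ncclGroupEnd.
void BcastToAllPlaces(const std::vector<ncclComm_t>& comms,
                      const std::vector<cudaStream_t>& streams,
                      const std::vector<void*>& buffers,
                      size_t numel, ncclDataType_t dtype) {
  ncclGroupStart();
  for (size_t i = 0; i < comms.size(); ++i) {
    // Device 0 holds the source copy; every other device receives into
    // its own buffers[i].
    ncclBcast(buffers[i], numel, dtype, /*root=*/0, comms[i], streams[i]);
  }
  ncclGroupEnd();
  // Analogue of member_->nccl_ctxs_->WaitAll() in the diff.
  for (cudaStream_t s : streams) cudaStreamSynchronize(s);
}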

paddle/fluid/inference/io.cc

Lines changed: 4 additions & 0 deletions
@@ -20,16 +20,20 @@ limitations under the License. */
 #include "paddle/fluid/framework/block_desc.h"
 #include "paddle/fluid/framework/feed_fetch_type.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/pybind/pybind.h"
 
 DEFINE_string(devices, "", "The devices to be used which is joined by comma.");
 DEFINE_bool(init_p2p, false, "Whether to init p2p.");
+DEFINE_int32(math_num_threads, 1,
+             "Number of threads used to run math functions.");
 
 namespace paddle {
 namespace inference {
 
 void Init(const std::vector<std::string> argv) {
   framework::InitGflags(argv);
+  operators::math::SetNumThreads(FLAGS_math_num_threads);
   // init devices
   std::vector<int> devices;
   std::string token;
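Since Init forwards argv to InitGflags, the new flag should be settable alongside the existing --devices and --init_p2p flags; the exact argv conventions and the value below are assumptions, not part of the commit:

#include "paddle/fluid/inference/io.h"

int main() {
  // Hypothetical call site: cap math-library threads for inference at 4.
  paddle::inference::Init({"--math_num_threads=4"});
  // ... load the program desc and run inference ...
  return 0;
}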

paddle/fluid/operators/detail/grpc_server.cc

Lines changed: 2 additions & 1 deletion
@@ -169,7 +169,8 @@ class RequestPrefetch final : public RequestBase {
 
     auto scope = request_->GetMutableLocalScope();
     auto invar = scope->FindVar(in_var_name);
-    framework::Variable* outvar = scope->FindVar(out_var_name);
+    // out var must be created in local scope!
+    framework::Variable* outvar = scope->Var(out_var_name);
 
     request_handler_->Handle(in_var_name, scope, invar, &outvar, out_var_name);
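The fix relies on the difference between a pure lookup and create-on-demand: FindVar can return nullptr when the prefetch output variable does not yet exist in the request-local scope, while Var creates it on first use. A toy model of those assumed Scope semantics:

#include <iostream>
#include <memory>
#include <string>
#include <unordered_map>

struct Variable {};

class Scope {
 public:
  // Pure lookup: returns nullptr when the variable is absent.
  Variable* FindVar(const std::string& name) const {
    auto it = vars_.find(name);
    return it == vars_.end() ? nullptr : it->second.get();
  }
  // Create-if-missing: always returns a usable variable.
  Variable* Var(const std::string& name) {
    auto& slot = vars_[name];
    if (!slot) slot = std::make_unique<Variable>();
    return slot.get();
  }

 private:
  std::unordered_map<std::string, std::unique_ptr<Variable>> vars_;
};

int main() {
  Scope local;
  std::cout << (local.FindVar("out") == nullptr) << "\n";  // 1: absent
  std::cout << (local.Var("out") != nullptr) << "\n";      // 1: created
}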

paddle/fluid/operators/math/blas.h

Lines changed: 15 additions & 0 deletions
@@ -20,13 +20,16 @@
 #ifdef PADDLE_WITH_MKLML
 #include <mkl_cblas.h>
 #include <mkl_lapacke.h>
+#include <mkl_service.h>
 #include <mkl_vml_functions.h>
 #endif
 
 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+#ifdef LAPACK_FOUND
 #include <lapacke.h>
 #endif
+#endif
 
 #ifndef LAPACK_FOUND
 extern "C" {
@@ -46,6 +49,18 @@ namespace paddle {
 namespace operators {
 namespace math {
 
+static void SetNumThreads(int num_threads) {
+#ifdef PADDLE_USE_OPENBLAS
+  int real_num_threads = num_threads > 1 ? num_threads : 1;
+  openblas_set_num_threads(real_num_threads);
+#elif defined(PADDLE_WITH_MKLML)
+  int real_num_threads = num_threads > 1 ? num_threads : 1;
+  mkl_set_num_threads(real_num_threads);
+#else
+  PADDLE_ENFORCE(false, "To be implemented.");
+#endif
+}
+
 /**
  * Matrix Descriptor of a memory buffer.
  *
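A hedged call-site sketch of the new helper: SetNumThreads clamps values below 1 to a single thread, dispatches to openblas_set_num_threads or mkl_set_num_threads depending on the build macros above, and deliberately fails via PADDLE_ENFORCE when neither backend is compiled in.

#include "paddle/fluid/operators/math/blas.h"

int main() {
  namespace math = paddle::operators::math;
  math::SetNumThreads(0);  // clamped: runs single-threaded
  math::SetNumThreads(8);  // 8 OpenBLAS/MKL worker threads
  return 0;
}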

paddle/fluid/operators/math/math_function.h

Lines changed: 2 additions & 0 deletions
@@ -21,8 +21,10 @@ limitations under the License. */
 
 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+#ifdef LAPACK_FOUND
 #include <lapacke.h>
 #endif
+#endif
 
 #ifndef LAPACK_FOUND
 extern "C" {

paddle/fluid/platform/nccl_helper.h

Lines changed: 5 additions & 0 deletions
@@ -41,6 +41,11 @@ inline ncclDataType_t ToNCCLDataType(std::type_index type) {
   }
 }
 
+// NOTE(minqiyang): according to the ncclGroupEnd documentations:
+// https://docs.nvidia.com/deeplearning/sdk/nccl-api/ncclapidoc.html,
+// ncclGroupEnd will wait for all communicators to be initialized, which will
+// cause blocking problem when a runtime_error was thrown, so try only guard
+// NCCL actions when use it.
 class NCCLGroupGuard {
  public:
   static std::mutex &NCCLMutex() {
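The guard's body lies outside this hunk, but the note implies an RAII shape whose destructor calls ncclGroupEnd: if an exception unwinds past the guard while some communicator is still uninitialized, the destructor blocks. A sketch of that shape, with names and details illustrative rather than Paddle's exact code:

#include <mutex>
#include <nccl.h>

// Illustrative RAII guard: open an NCCL group under a global mutex, close it
// on scope exit. If a runtime_error is thrown inside the guarded scope,
// ~GroupGuard still runs ncclGroupEnd, which waits for every communicator to
// be initialized; hence the advice to keep only NCCL calls inside the guard.
class GroupGuard {
 public:
  GroupGuard() {
    mutex().lock();
    ncclGroupStart();
  }
  ~GroupGuard() {
    ncclGroupEnd();  // may block until all communicators are initialized
    mutex().unlock();
  }

 private:
  static std::mutex& mutex() {
    static std::mutex m;
    return m;
  }
};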

paddle/math/MathFunctions.h

Lines changed: 4 additions & 5 deletions
@@ -12,16 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#ifndef MATHFUNCTIONS_H_
-#define MATHFUNCTIONS_H_
+#pragma once
 
 #ifdef PADDLE_WITH_MKLML
 #include <mkl_cblas.h>
 #include <mkl_lapacke.h>
 #include <mkl_vml_functions.h>
 #endif
 
-#if defined(PADDLE_USE_VECLIB)
+#ifdef PADDLE_USE_VECLIB
 extern "C" {
 #include <cblas.h>
 #include <clapack.h>
@@ -30,8 +29,10 @@ extern "C" {
 
 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
+#ifdef LAPACK_FOUND
 #include <lapacke.h>
 #endif
+#endif
 
 #ifndef LAPACK_FOUND
 extern "C" {
@@ -126,5 +127,3 @@ template <class T>
 void vTanh(const int n, const T* a, T* r);
 
 }  // namespace paddle
-
-#endif  // MATHFUNCTIONS_H_

python/paddle/fluid/layers/nn.py

Lines changed: 1 addition & 1 deletion
@@ -4242,7 +4242,7 @@ def random_crop(x, shape, seed=None):
     seed_out = helper.create_tmp_variable(dtype="int64")
     helper.append_op(
         type="random_crop",
-        inputs={"X": input,
+        inputs={"X": x,
                 "Seed": seed},
         outputs={"Out": out,
                  "SeedOut": seed_out},
