Commit b8975d6

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/combine_open_files_and_double_buffer

2 parents d36e13e + cebf7c6

27 files changed (+395, -122 lines)

CMakeLists.txt
Lines changed: 11 additions & 0 deletions

@@ -66,6 +66,12 @@ option(WITH_ANAKIN "Compile with Anakin library" OFF)
 option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE})
 option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF)
 option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
+option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
+
+# PY_VERSION
+if(NOT PY_VERSION)
+  set(PY_VERSION 2.7)
+endif()

 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -146,6 +152,7 @@ endif()
 ########################################################################################

 include(external/mklml)    # download mklml package
+include(external/libxsmm)  # download, build, install libxsmm
 include(external/zlib)     # download, build, install zlib
 include(external/gflags)   # download, build, install gflags
 include(external/glog)     # download, build, install glog
@@ -232,6 +239,10 @@ if(WITH_MKLML)
   list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
 endif()

+if(WITH_LIBXSMM)
+  list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS})
+endif()
+
 if(WITH_MKLDNN)
   list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
 endif()
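Note on usage: with the default above, a configure run without extra flags keeps targeting Python 2.7, while a value passed at configure time (for example -DPY_VERSION=3.5; the concrete version is only illustrative, the commit does not pin which versions are supported) flows into the FIND_PACKAGE(PythonInterp)/FIND_PACKAGE(PythonLibs) calls updated in cmake/external/python.cmake below.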

Dockerfile
Lines changed: 1 addition & 1 deletion

@@ -80,7 +80,7 @@ RUN pip install pre-commit 'ipython==5.3.0' && \
     pip install opencv-python

 #For docstring checker
-RUN pip install pylint pytest astroid isort
+RUN pip install pylint pytest astroid isort LinkChecker

 COPY ./python/requirements.txt /root/
 RUN pip install -r /root/requirements.txt
cmake/external/libxsmm.cmake
Lines changed: 57 additions & 0 deletions

@@ -0,0 +1,57 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+OPTION(WITH_LIBXSMM "Compile with libxsmm" OFF)
+
+IF(NOT WITH_LIBXSMM)
+  return()
+ENDIF()
+
+IF(WIN32 OR APPLE OR ANDROID OR IOS)
+  MESSAGE(WARNING "Windows, Mac or Mobile are not supported with libxsmm in Paddle yet.")
+  SET(WITH_LIBXSMM OFF CACHE STRING "Disable LIBXSMM" FORCE)
+  return()
+ENDIF()
+
+INCLUDE(ExternalProject)
+
+SET(LIBXSMM_SOURCES_DIR ${THIRD_PARTY_PATH}/libxsmm)
+SET(LIBXSMM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/libxsmm)
+SET(LIBXSMM_INCLUDE_DIR "${LIBXSMM_INSTALL_DIR}/include" CACHE PATH "LIBXSMM include directory." FORCE)
+SET(LIBXSMM_LIBRARY_DIR "${LIBXSMM_INSTALL_DIR}/lib" CACHE PATH "LIBXSMM library directory." FORCE)
+SET(LIBXSMM_LIBS "${LIBXSMM_LIBRARY_DIR}/libxsmm.a"
+                 "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a")
+
+ExternalProject_Add(
+    extern_libxsmm
+    GIT_REPOSITORY    "https://github.com/hfp/libxsmm.git"
+    GIT_TAG           "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2"
+    PREFIX            ${LIBXSMM_SOURCES_DIR}
+    UPDATE_COMMAND    ""
+    CONFIGURE_COMMAND ""
+    BUILD_IN_SOURCE   1
+    BUILD_COMMAND     $(MAKE) --silent PREFIX=${LIBXSMM_INSTALL_DIR} CXX=g++ CC=gcc WARP=0 install
+    INSTALL_COMMAND   ""
+)
+ADD_LIBRARY(libxsmm STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmm.a")
+SET_PROPERTY(TARGET libxsmm PROPERTY IMPORTED_LOCATION "${LIBXSMM_LIBRARY_DIR}/libxsmmnoblas.a")
+
+MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}")
+include_directories(${LIBXSMM_INCLUDE_DIR})
+ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM)
+ADD_DEPENDENCIES(libxsmm extern_libxsmm)
+LIST(APPEND external_project_dependencies libxsmm)
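The integration is opt-in: WITH_LIBXSMM defaults to OFF and is forced back to OFF on Windows, macOS, Android and iOS. When enabled (configuring with -DWITH_LIBXSMM=ON), the pinned libxsmm revision is fetched, built in-source with make, and linked as the static archives libxsmm.a and libxsmmnoblas.a; the PADDLE_WITH_LIBXSMM definition added here is what activates the SMM_GEMM code paths in paddle/fluid/operators/math/blas_impl.h below.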
cmake/external/openblas.cmake
Lines changed: 5 additions & 0 deletions

@@ -121,6 +121,11 @@ ELSE()
   TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
 ENDIF("${CBLAS_PROVIDER}" STREQUAL "MKLML")

+IF(WITH_LIBXSMM)
+  TARGET_LINK_LIBRARIES(cblas ${LIBXSMM_LIBS})
+  ADD_DEPENDENCIES(cblas extern_libxsmm)
+ENDIF()
+
 IF(NOT ${CBLAS_FOUND})
   ADD_DEPENDENCIES(cblas extern_openblas)
   LIST(APPEND external_project_dependencies cblas)

cmake/external/python.cmake
Lines changed: 3 additions & 2 deletions

@@ -18,8 +18,9 @@ ENDIF()

 INCLUDE(python_module)

-FIND_PACKAGE(PythonInterp 2.7)
-FIND_PACKAGE(PythonLibs 2.7)
+FIND_PACKAGE(PythonInterp ${PY_VERSION})
+FIND_PACKAGE(PythonLibs ${PY_VERSION})
+
 # Fixme: Maybe find a static library. Get SHARED/STATIC by FIND_PACKAGE.
 ADD_LIBRARY(python SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET python PROPERTY IMPORTED_LOCATION ${PYTHON_LIBRARIES})

paddle/fluid/framework/details/multi_devices_graph_builder.cc
Lines changed: 26 additions & 12 deletions

@@ -276,13 +276,22 @@ std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
     }
   }

-  // Insert BCast Ops
-  for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
-    auto &to_bcast_set = bcast_var_name_set[dev_id];
-    for (auto &bcast_name : to_bcast_set) {
-      CreateBroadcastOp(&result, bcast_name, dev_id);
+  bool use_gpu = false;
+#ifdef PADDLE_WITH_CUDA
+  use_gpu = nccl_ctxs_ != nullptr;
+#endif
+
+  if (use_gpu ||
+      strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce) {
+    // Insert BCast Ops
+    for (size_t dev_id = 0; dev_id < bcast_var_name_set.size(); ++dev_id) {
+      auto &to_bcast_set = bcast_var_name_set[dev_id];
+      for (auto &bcast_name : to_bcast_set) {
+        CreateBroadcastOp(&result, bcast_name, dev_id);
+      }
     }
   }
+
   /*
   Dependency graph has been constructed. However, there are still data
   hazards need to be handled.
@@ -412,14 +421,19 @@ int MultiDevSSAGraphBuilder::GetOpDeviceID(const OpDesc &op) const {
   if (strategy_.reduce_ != BuildStrategy::ReduceStrategy::kReduce) {
     return -1;
   }
-
-  for (auto &varname : op.InputArgumentNames()) {
-    int dev_id = GetVarDeviceID(varname);
-    if (dev_id != -1) {
-      return dev_id;
-    }
+  int op_role = boost::get<int>(
+      op.GetAttr(framework::OpProtoAndCheckerMaker::OpRoleAttrName()));
+  if (op_role != static_cast<int>(framework::OpRole::kOptimize)) {
+    return -1;
   }
-  return -1;
+  auto param_grad = boost::get<std::vector<std::string>>(
+      op.GetAttr(OpProtoAndCheckerMaker::OpRoleVarAttrName()));
+
+  PADDLE_ENFORCE_EQ(param_grad.size(), 2U);
+  int dev_id = GetVarDeviceID(param_grad[1]);
+  PADDLE_ENFORCE_NE(dev_id, -1, "dev_id should not be -1.[%s, %s]", op.Type(),
+                    param_grad[0]);
+  return dev_id;
 }

 int MultiDevSSAGraphBuilder::GetVarDeviceID(const std::string &varname) const {
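Two behavioral changes land in this file: broadcast ops for reduced variables are now inserted only when NCCL contexts exist (GPU execution) or the kAllReduce strategy is active, and under kReduce an optimizer op is no longer placed by scanning all of its inputs; it is pinned to the device that owns its gradient, read from the op's OpRoleVar attribute, which records a {parameter, gradient} name pair.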

paddle/fluid/framework/parallel_executor.cc
Lines changed: 18 additions & 5 deletions

@@ -45,6 +45,7 @@ class ParallelExecutorPrivate {
 #endif
   bool own_local_scope_;
   bool use_cuda_;
+  bool use_all_reduce_;
 };

 std::vector<Scope *> &ParallelExecutor::GetLocalScopes() {
@@ -62,6 +63,14 @@ ParallelExecutor::ParallelExecutor(
     : member_(new ParallelExecutorPrivate(places)) {
   member_->global_scope_ = scope;
   member_->use_cuda_ = exec_strategy.use_cuda_;
+  member_->use_all_reduce_ =
+      build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
+
+  if (!member_->use_all_reduce_) {
+    PADDLE_ENFORCE(places.size() > 1,
+                   "If you set build_strategy.reduce with 'Reduce',"
+                   "the number of places must be greater than 1.");
+  }

   // Step 1. Bcast the params to devs.
   // Create local scopes
@@ -117,7 +126,7 @@ ParallelExecutor::ParallelExecutor(
 #ifdef PADDLE_WITH_CUDA
     builder_factory.SetNCCLContextMap(member_->nccl_ctxs_.get());
 #else
-    PADDLE_THROW("Not compiled with CUDA");
+    PADDLE_THROW("Not compiled with CUDA.");
 #endif
   }

@@ -133,7 +142,7 @@ ParallelExecutor::ParallelExecutor(

 void ParallelExecutor::BCastParamsToDevs(
     const std::unordered_set<std::string> &vars) const {
-  // the the initializing bcast, all vars would be bcast from device(0),
+  // the initializing bcast, all vars would be bcast from device(0),
   // otherwise
   // bcast from the specified device.
   bool initializing = builder_.get() == nullptr ? true : false;
@@ -209,9 +218,13 @@ void ParallelExecutor::BCastParamsToDevs(

       auto local_scope = member_->local_scopes_[i];
       auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
-      t->Resize(dims);
-      t->mutable_data(cpu, main_tensor.type());
-      paddle::framework::TensorCopy(main_tensor, cpu, t);
+      if (member_->use_all_reduce_ || member_->use_cuda_) {
+        t->Resize(dims);
+        t->mutable_data(cpu, main_tensor.type());
+        paddle::framework::TensorCopy(main_tensor, cpu, t);
+      } else {
+        t->ShareDataWith(main_tensor);
+      }
     }
   }
 }
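The BCastParamsToDevs change complements the skipped broadcasts above: when running the reduce strategy on CPU, each local scope shares the source tensor's storage through ShareDataWith instead of materializing a per-device copy, and the new constructor check rejects the 'Reduce' strategy outright when there is only a single place to shard across.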

paddle/fluid/operators/math/blas.h
Lines changed: 4 additions & 0 deletions

@@ -21,6 +21,10 @@
 #include "paddle/fluid/platform/dynload/mklml.h"
 #endif

+#ifdef PADDLE_WITH_LIBXSMM
+#include <libxsmm.h>
+#endif
+
 #ifdef PADDLE_USE_OPENBLAS
 #include <cblas.h>
 #endif

paddle/fluid/operators/math/blas_impl.h
Lines changed: 58 additions & 2 deletions

@@ -12,6 +12,7 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 #pragma once
+#include <limits>
 #include <vector>
 #include "paddle/fluid/operators/math/math_function.h"

@@ -30,6 +31,12 @@ struct CBlas<float> {
     platform::dynload::cblas_sgemm(args...);
   }

+#ifdef PADDLE_WITH_LIBXSMM
+  template <typename... ARGS>
+  static void SMM_GEMM(ARGS... args) {
+    libxsmm_sgemm(args...);
+  }
+#endif
   template <typename... ARGS>
   static void AXPY(ARGS... args) {
     platform::dynload::cblas_saxpy(args...);
@@ -63,6 +70,12 @@ struct CBlas<double> {
     platform::dynload::cblas_dgemm(args...);
   }

+#ifdef PADDLE_WITH_LIBXSMM
+  template <typename... ARGS>
+  static void SMM_GEMM(ARGS... args) {
+    libxsmm_dgemm(args...);
+  }
+#endif
   template <typename... ARGS>
   static void AXPY(ARGS... args) {
     platform::dynload::cblas_daxpy(args...);
@@ -140,13 +153,43 @@ struct CBlas<double> {
 template <>
 struct CBlas<platform::float16> {
   static void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
+  static void SMM_GEMM(...) {
+    PADDLE_THROW("float16 SMM_GEMM not supported on CPU");
+  }
 #ifdef PADDLE_WITH_MKLML
   static void GEMM_BATCH(...) {
     PADDLE_THROW("float16 GEMM_BATCH not supported on CPU");
   }
 #endif
 };

+template <typename T>
+inline bool UseXSMM(const int &m, const int &n, const int &k, bool transa,
+                    bool transb, const T &alpha, const T &beta) {
+#ifdef PADDLE_WITH_LIBXSMM
+  // Refer to https://github.com/hfp/libxsmm/blob/master/README.md
+  // But the threshold is custom
+  constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20;
+  if (m * n * k > LIBXSMM_THRESHOLD || transa || transb ||
+      std::abs(alpha - static_cast<T>(1)) >
+          std::numeric_limits<T>::epsilon() ||
+      std::abs(beta) > std::numeric_limits<T>::epsilon()) {
+    return false;
+  } else {
+    return true;
+  }
+#endif
+  return false;
+}
+
+template <>
+inline bool UseXSMM<platform::float16>(const int &m, const int &n, const int &k,
+                                       bool transa, bool transb,
+                                       const platform::float16 &alpha,
+                                       const platform::float16 &beta) {
+  return false;
+}
+
 template <>
 template <typename T>
 void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
@@ -156,8 +199,21 @@ void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,
   int lda = (transA == CblasNoTrans) ? K : M;
   int ldb = (transB == CblasNoTrans) ? N : K;
   int ldc = N;
-  CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
-                 beta, C, ldc);
+#ifdef PADDLE_WITH_LIBXSMM
+  if (UseXSMM(M, N, K, transA != CblasNoTrans, transB != CblasNoTrans, alpha,
+              beta)) {
+    // Note: SMM use ColMajor
+    const char transa = 'N';
+    const char transb = 'N';
+    CBlas<T>::SMM_GEMM(&transa, &transb, &N, &M, &K, &alpha, B, &ldb, A, &lda,
+                       &beta, C, &ldc);
+  } else {
+#endif
+    CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B,
+                   ldb, beta, C, ldc);
+#ifdef PADDLE_WITH_LIBXSMM
+  }
+#endif
 }

 template <>
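The swapped operands in the SMM_GEMM call work because a row-major C = A * B occupies exactly the same memory as a column-major C^T = B^T * A^T, so libxsmm's column-major kernel can be handed B and A in reverse order with the row-major strides as leading dimensions. Below is a minimal standalone sketch of that call convention (an illustration, not code from the commit; it assumes libxsmm's header and the static libraries built by the cmake rule above are available):

// smm_demo.cc: row-major C = A * B through libxsmm's column-major sgemm,
// mirroring the operand swap in the GEMM dispatch above.
// Link against libxsmm.a and libxsmmnoblas.a.
#include <libxsmm.h>

#include <cstdio>
#include <vector>

int main() {
  const libxsmm_blasint M = 2, N = 3, K = 4;
  std::vector<float> A(M * K, 1.f);  // row-major M x K
  std::vector<float> B(K * N, 2.f);  // row-major K x N
  std::vector<float> C(M * N, 0.f);  // row-major M x N
  const float alpha = 1.f, beta = 0.f;
  const char no_trans = 'N';
  // Column-major view: C^T (N x M) = B^T (N x K) * A^T (K x M), hence B is
  // passed before A, m = N, n = M, and the row-major strides become the
  // leading dimensions.
  libxsmm_sgemm(&no_trans, &no_trans, &N, &M, &K, &alpha, B.data(), &N,
                A.data(), &K, &beta, C.data(), &N);
  std::printf("C[0][0] = %.1f\n", C[0]);  // expect 8.0 = K * 1.0 * 2.0
  return 0;
}

Note also the gate in UseXSMM: the libxsmm path is taken only when m * n * k <= 20 * 20 * 20 = 8000, both operands are untransposed, alpha equals 1, and beta equals 0; every other shape or coefficient falls through to the regular CBlas GEMM.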
