
Commit ff052c0

merge develop
2 parents: c6a5c4b + 3ae97aa


44 files changed: +1340 −239 lines

CMakeLists.txt

Lines changed: 5 additions & 4 deletions
@@ -204,6 +204,11 @@ include(external/snappy) # download snappy
 include(external/snappystream)
 include(external/threadpool)
 
+include(flags)      # set paddle compile flags
+include(cudnn)      # set cudnn libraries, must before configure
+include(cupti)
+include(configure)  # add paddle env configuration
+
 if(WITH_GPU)
   include(cuda)
   include(tensorrt)
@@ -212,15 +217,11 @@ elseif()
   set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
 endif()
 
-include(cudnn)      # set cudnn libraries, must before configure
-include(cupti)
-include(configure)  # add paddle env configuration
 include(generic)    # simplify cmake module
 include(package)    # set paddle packages
 include(ccache)     # set ccache for compilation
 include(util)       # set unittest and link libs
 include(rdma)       # set rdma libraries
-include(flags)      # set paddle compile flags
 include(version)    # set PADDLE_VERSION
 include(coveralls)  # set code coverage
 include(inference_lib)  # add paddle fluid inference libraries

cmake/configure.cmake

Lines changed: 16 additions & 11 deletions
@@ -50,16 +50,16 @@ if(NOT WITH_PROFILER)
 endif(NOT WITH_PROFILER)
 
 if(NOT CMAKE_CROSSCOMPILING)
-    if(WITH_AVX AND AVX_FOUND)
+    if(WITH_AVX AND AVX512F_FOUND)
+        set(SIMD_FLAG ${AVX512F_FLAG})
+    elseif(WITH_AVX AND AVX2_FOUND)
+        set(SIMD_FLAG ${AVX2_FLAG})
+    elseif(WITH_AVX AND AVX_FOUND)
         set(SIMD_FLAG ${AVX_FLAG})
     elseif(SSE3_FOUND)
         set(SIMD_FLAG ${SSE3_FLAG})
     endif()
 endif()
-if(UNIX AND NOT APPLE)
-    # except apple from nix*Os family
-    set(LINUX TRUE)
-endif(UNIX AND NOT APPLE)
 
 if(NOT WITH_GOLANG)
   add_definitions(-DPADDLE_WITHOUT_GOLANG)
@@ -103,15 +103,20 @@ if(WITH_GPU)
   endif()
   if(WITH_ANAKIN)
     if(${CUDA_VERSION_MAJOR} VERSION_LESS 8)
-      message(FATAL_ERROR "Anakin needs CUDA >= 8.0 to compile")
+      message(WARNING "Anakin needs CUDA >= 8.0 to compile. Force WITH_ANAKIN=OFF")
+      set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDA >= 8.0." FORCE)
     endif()
     if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
-      message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile")
+      message(WARNING "Anakin needs CUDNN >= 7.0 to compile. Force WITH_ANAKIN=OFF")
+      set(WITH_ANAKIN OFF CACHE STRING "Anakin is valid only when CUDNN >= 7.0." FORCE)
     endif()
-    set(ENV{CUDNN_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR})
-    set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY})
-    message(STATUS "cudnn include header is ${CUDNN_INCLUDE_DIR}/cudnn.h")
-    message(STATUS "cudnn library is ${CUDNN_LIBRARY}")
+  endif()
+  if(WITH_ANAKIN)
+    # NOTICE(minqiyang): the end slash is important because $CUDNN_INCLUDE_DIR
+    # is a softlink to real cudnn.h directory
+    set(ENV{CUDNN_INCLUDE_DIR} "${CUDNN_INCLUDE_DIR}/")
+    get_filename_component(CUDNN_LIBRARY_DIR ${CUDNN_LIBRARY} DIRECTORY)
+    set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY_DIR})
   endif()
 elseif(WITH_AMD_GPU)
   add_definitions(-DPADDLE_WITH_HIP)
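
Note on the SIMD hunk: the cascade now prefers the widest extension the build machine supports, in the order AVX-512F > AVX2 > AVX > SSE3. For illustration only, a runtime analog of the same preference order using the GCC/Clang __builtin_cpu_supports builtin (a sketch, not part of this commit):

    #include <cstdio>

    // Pick the widest SIMD level available at runtime, mirroring the
    // build-time SIMD_FLAG cascade in configure.cmake.
    static const char *PickSimdLevel() {
    #if defined(__GNUC__)
      __builtin_cpu_init();  // make sure CPU feature detection is initialized
      if (__builtin_cpu_supports("avx512f")) return "avx512f";
      if (__builtin_cpu_supports("avx2")) return "avx2";
      if (__builtin_cpu_supports("avx")) return "avx";
      if (__builtin_cpu_supports("sse3")) return "sse3";
    #endif
      return "generic";
    }

    int main() { std::printf("SIMD level: %s\n", PickSimdLevel()); }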

cmake/cudnn.cmake

Lines changed: 19 additions & 2 deletions
@@ -25,8 +25,25 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS
     $ENV{CUDNN_ROOT}
     $ENV{CUDNN_ROOT}/lib64
     $ENV{CUDNN_ROOT}/lib
-    /usr/lib)
-find_library(CUDNN_LIBRARY NAMES libcudnn.so libcudnn.dylib # libcudnn_static.a
+    /usr/lib
+    ${CUDA_TOOLKIT_ROOT_DIR}
+    ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64
+    )
+set(CUDNN_LIB_NAME "")
+if (LINUX)
+set(CUDNN_LIB_NAME "libcudnn.so")
+endif(LINUX)
+
+if(WIN32)
+# only support cudnn7
+set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll")
+endif(WIN32)
+
+if(Apple)
+set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so")
+endif(Apple)
+
+find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a
     PATHS ${CUDNN_CHECK_LIBRARY_DIRS} ${CUDNN_INCLUDE_DIR} ${__libpath_hist}
           NO_DEFAULT_PATH
     DOC "Path to cuDNN library.")

cmake/external/anakin.cmake

Lines changed: 7 additions & 6 deletions
@@ -19,17 +19,17 @@ execute_process(COMMAND bash -c "cd ${ANAKIN_SOURCE_DIR}; wget -q --no-check-cer
 include_directories(${ANAKIN_INCLUDE})
 include_directories(${ANAKIN_INCLUDE}/saber/)
 
-set(ANAKIN_COMPILE_EXTRA_FLAGS 
+set(ANAKIN_COMPILE_EXTRA_FLAGS
     -Wno-error=unused-but-set-variable -Wno-unused-but-set-variable
-    -Wno-error=unused-variable -Wno-unused-variable 
+    -Wno-error=unused-variable -Wno-unused-variable
     -Wno-error=format-extra-args -Wno-format-extra-args
-    -Wno-error=comment -Wno-comment 
-    -Wno-error=format -Wno-format 
+    -Wno-error=comment -Wno-comment
+    -Wno-error=format -Wno-format
     -Wno-error=switch -Wno-switch
-    -Wno-error=return-type -Wno-return-type 
+    -Wno-error=return-type -Wno-return-type
     -Wno-error=non-virtual-dtor -Wno-non-virtual-dtor
     -Wno-sign-compare
-    -Wno-reorder 
+    -Wno-reorder
     -Wno-error=cpp)
 
 ExternalProject_Add(
@@ -47,6 +47,7 @@ ExternalProject_Add(
     -DPROTOBUF_ROOT=${THIRD_PARTY_PATH}/install/protobuf
     -DMKLML_ROOT=${THIRD_PARTY_PATH}/install/mklml
     -DCUDNN_ROOT=${CUDNN_ROOT}
+    -DCUDNN_INCLUDE_DIR=${CUDNN_INCLUDE_DIR}
     ${EXTERNAL_OPTIONAL_ARGS}
     CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${ANAKIN_INSTALL_DIR}
 )

cmake/flags.cmake

Lines changed: 5 additions & 0 deletions
@@ -142,6 +142,11 @@ else()
       ${GPU_COMMON_FLAGS})
 endif()
 
+if(UNIX AND NOT APPLE)
+  # except apple from nix*Os family
+  set(LINUX TRUE)
+endif(UNIX AND NOT APPLE)
+
 
 foreach(flag ${COMMON_FLAGS})
   safe_set_cflag(CMAKE_C_FLAGS ${flag})

cmake/simd.cmake

Lines changed: 13 additions & 1 deletion
@@ -10,6 +10,7 @@ if(CMAKE_COMPILER_IS_GNUCC OR CMAKE_COMPILER_IS_GNUCXX OR CMAKE_CXX_COMPILER_ID
   set(SSE3_FLAG "-msse3")
   set(AVX_FLAG "-mavx")
   set(AVX2_FLAG "-mavx2")
+  set(AVX512F_FLAG "-mavx512f")
 elseif(MSVC)
   set(MMX_FLAG "/arch:MMX")
   set(SSE2_FLAG "/arch:SSE2")
@@ -81,5 +82,16 @@ int main()
     return 0;
 }" AVX2_FOUND)
 
+# Check AVX512F
+set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG})
+set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE)
+CHECK_CXX_SOURCE_RUNS("
+#include <immintrin.h>
+int main()
+{
+    __m512i a = _mm512_undefined_epi32();
+    return 0;
+}" AVX512F_FOUND)
+
 set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED})
-mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND)
+mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND)
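
The new probe compiles and runs a one-line AVX-512F program under -mavx512f, so AVX512F_FOUND is set only when both the compiler and the build machine support the extension. As a sketch of what that flag gates, a minimal self-contained AVX-512F kernel (assumes a CPU and compiler with -mavx512f; not part of this commit):

    #include <immintrin.h>
    #include <cstdio>

    int main() {
      alignas(64) int a[16], b[16], c[16];
      for (int i = 0; i < 16; ++i) {
        a[i] = i;
        b[i] = 2 * i;
      }
      __m512i va = _mm512_load_epi32(a);  // one register holds 16 x int32
      __m512i vb = _mm512_load_epi32(b);
      _mm512_store_epi32(c, _mm512_add_epi32(va, vb));
      std::printf("c[15] = %d\n", c[15]);  // 15 + 30 = 45
      return 0;
    }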

paddle/fluid/framework/CMakeLists.txt

Lines changed: 7 additions & 2 deletions
@@ -99,8 +99,13 @@ else()
   cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method)
 endif()
 
-
-cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass)
+if (NOT WIN32)
+cc_library(parallel_executor SRCS parallel_executor.cc DEPS
+        threaded_ssa_graph_executor scope_buffered_ssa_graph_executor
+        graph graph_viz_pass multi_devices_graph_pass
+        multi_devices_graph_print_pass multi_devices_graph_check_pass
+        fast_threaded_ssa_graph_executor)
+endif() # NOT WIN32
 
 cc_library(prune SRCS prune.cc DEPS framework_proto)
 cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)

paddle/fluid/framework/details/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -42,3 +42,5 @@ cc_test(gather_op_test SRCS gather_op_handle_test.cc DEPS var_handle op_handle_b
 cc_library(scope_buffered_ssa_graph_executor SRCS scope_buffered_ssa_graph_executor.cc DEPS ssa_graph_executor)
 #cc_test(reduce_op_handle_test SRCS reduce_op_handle_test.cc DEPS var_handle op_handle_base scope ddim memory
 #        device_context reduce_op_handle )
+cc_library(fast_threaded_ssa_graph_executor SRCS fast_threaded_ssa_graph_executor.cc
+        DEPS fetch_op_handle ssa_graph_executor scope simple_threadpool device_context)

paddle/fluid/framework/details/execution_strategy.h

Lines changed: 3 additions & 0 deletions
@@ -19,10 +19,13 @@ namespace framework {
 namespace details {
 
 struct ExecutionStrategy {
+  enum ExecutorType { kDefault = 0, kExperimental = 1 };
+
   size_t num_threads_{0};
   bool use_cuda_{true};
   bool allow_op_delay_{false};
   size_t num_iteration_per_drop_scope_{100};
+  ExecutorType type_{kDefault};
 };
 
 }  // namespace details
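
The new ExecutorType field lets callers opt into the experimental executor introduced by this commit, while kDefault keeps the existing threaded executor. A self-contained sketch of the selection pattern (the Executor stand-ins are hypothetical; the real classes take graphs, scopes, and places):

    #include <cstddef>
    #include <cstdio>
    #include <memory>

    struct Executor {
      virtual ~Executor() = default;
      virtual void Run() = 0;
    };
    struct ThreadedExecutor : Executor {      // stands in for the default path
      void Run() override { std::printf("default executor\n"); }
    };
    struct FastThreadedExecutor : Executor {  // stands in for the experimental path
      void Run() override { std::printf("experimental executor\n"); }
    };

    struct ExecutionStrategy {
      enum ExecutorType { kDefault = 0, kExperimental = 1 };
      size_t num_threads_{0};
      ExecutorType type_{kDefault};
    };

    std::unique_ptr<Executor> MakeExecutor(const ExecutionStrategy &s) {
      if (s.type_ == ExecutionStrategy::kExperimental)
        return std::unique_ptr<Executor>(new FastThreadedExecutor);
      return std::unique_ptr<Executor>(new ThreadedExecutor);
    }

    int main() {
      ExecutionStrategy strategy;
      strategy.type_ = ExecutionStrategy::kExperimental;
      MakeExecutor(strategy)->Run();  // prints "experimental executor"
    }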
paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc

Lines changed: 175 additions & 0 deletions (new file)

@@ -0,0 +1,175 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/details/fetch_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+FastThreadedSSAGraphExecutor::FastThreadedSSAGraphExecutor(
+    const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places,
+    std::unique_ptr<ir::Graph> &&graph)
+    : strategy_(strategy),
+      local_scopes_(local_scopes),
+      places_(places),
+      graph_(std::move(graph)),
+      pool_(strategy.num_threads_ +
+            1),  // add one more thread for generate op_deps
+      fetch_ctxs_(places) {
+  auto &ops = graph_->Get<details::GraphOps>("ops");
+
+  for (auto &op : ops) {
+    int dep = static_cast<int>(op->NotReadyInputSize());
+    op_deps_.emplace(op.get(), dep);
+    if (dep == 0) {
+      bootstrap_ops_.emplace_back(op.get());
+    }
+  }
+
+  PrepareAtomicOpDeps();
+}
+
+FeedFetchList FastThreadedSSAGraphExecutor::Run(
+    const std::vector<std::string> &fetch_tensors) {
+  std::unique_ptr<std::unordered_map<OpHandleBase *, std::atomic<int>>>
+      op_deps = atomic_op_deps_.get();
+  PrepareAtomicOpDeps();
+
+  paddle::framework::FeedFetchList fetches;
+  fetches.resize(fetch_tensors.size());
+  std::unordered_map<std::string, std::vector<VarHandleBase *>> fetched_vars;
+  std::vector<std::unique_ptr<ir::Node>> fetch_nodes;
+  std::vector<std::unique_ptr<FetchOpHandle>> fetch_ops;
+
+  for (auto &fetch_var_name : fetch_tensors) {
+    for (auto &var_map : graph_->Get<details::GraphVars>("vars")) {
+      auto it = var_map.find(fetch_var_name);
+      if (it != var_map.end()) {
+        fetched_vars[fetch_var_name].push_back(it->second.rbegin()->get());
+      }
+    }
+  }
+
+  for (size_t i = 0; i < fetch_tensors.size(); ++i) {
+    auto &var_name = fetch_tensors[i];
+    auto fetched_var_it = fetched_vars.find(var_name);
+    PADDLE_ENFORCE(fetched_var_it != fetched_vars.end(),
+                   "Cannot find fetched variable.(Perhaps the main_program "
+                   "is not set to ParallelExecutor)");
+
+    auto &vars = fetched_var_it->second;
+
+    fetch_nodes.emplace_back(new ir::Node("fetch", ir::Node::Type::kOperation));
+    auto *op = new FetchOpHandle(fetch_nodes.back().get(), &fetches, i,
+                                 &local_scopes_);
+    fetch_ops.emplace_back(op);
+
+    for (auto &p : places_) {
+      op->SetDeviceContext(p, fetch_ctxs_.Get(p));
+    }
+
+    for (auto *var : vars) {
+      op->AddInput(var);
+    }
+
+    (*op_deps)[op] = static_cast<int>(op->NotReadyInputSize());
+  }
+
+  size_t num_complete = 0;
+  remaining_ = 0;
+  BlockingQueue<size_t> complete_q;
+  for (auto op : bootstrap_ops_) {
+    RunOpAsync(op_deps.get(), op, &complete_q);
+  }
+
+  while (num_complete != op_deps->size()) {
+    size_t num_comp = complete_q.Pop();
+    if (num_comp == -1UL) {
+      int remaining = 0;
+      while (true) {
+        remaining = remaining_;
+        if (remaining == 0) {
+          break;
+        }
+        for (int i = 0; i < remaining; ++i) {
+          complete_q.Pop();
+        }
+      }
+      exception_.ReThrow();
+    }
+    num_complete += num_comp;
+  }
+  // Wait FetchOps.
+  if (!fetch_ops.empty()) {
+    fetch_ops.clear();
+  }
+  return fetches;
+}
+void FastThreadedSSAGraphExecutor::RunOpAsync(
+    std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps,
+    OpHandleBase *op, BlockingQueue<size_t> *complete_q) {
+  ++remaining_;
+  this->pool_.enqueue([=] {
+    OpHandleBase *op_to_run = op;
+    size_t complete = 0;
+    while (op_to_run != nullptr) {
+      try {
+        op_to_run->Run(strategy_.use_cuda_);
+        ++complete;
+      } catch (...) {
+        exception_.Catch(std::current_exception());
+        --remaining_;
+        complete_q->Push(-1UL);
+        return;
+      }
+      auto &outputs = op_to_run->Outputs();
+      op_to_run = nullptr;
+      for (auto &output : outputs) {
+        for (auto &pending_op : output->PendingOps()) {
+          std::atomic<int> &deps = op_deps->at(pending_op);
+          if (deps.fetch_sub(1) == 1) {  // pending_op ready
+            if (op_to_run == nullptr) {
+              op_to_run = pending_op;
+            } else {
+              this->RunOpAsync(op_deps, pending_op, complete_q);
+            }
+          }
+        }
+      }
+    }
+    --remaining_;
+    complete_q->Push(complete);
+  });
+}
+void FastThreadedSSAGraphExecutor::PrepareAtomicOpDeps() {
+  atomic_op_deps_ = pool_.enqueue([&] {
+    std::unordered_map<OpHandleBase *, std::atomic<int>> *op_deps =
+        new std::unordered_map<OpHandleBase *, std::atomic<int>>;
+    for (auto &pair : op_deps_) {
+      (*op_deps)[pair.first] = pair.second;
+    }
+    return std::unique_ptr<
+        std::unordered_map<OpHandleBase *, std::atomic<int>>>(op_deps);
+  });
+}
+
+const ir::Graph &FastThreadedSSAGraphExecutor::Graph() const { return *graph_; }
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
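
The scheduling core above is atomic dependency counting: each op starts with its not-ready input count, ops whose count is zero bootstrap the run, and when a finished op's fetch_sub drops a consumer's count to zero, that consumer either continues on the same worker (op_to_run) or is handed to a new task via RunOpAsync. Every task finally pushes the number of ops it completed to a blocking queue, which Run drains until all ops are accounted for. A self-contained sketch of that pattern with plain std::thread in place of Paddle's ThreadPool and integer ops in place of OpHandleBase (all names illustrative; compile with -pthread):

    #include <atomic>
    #include <condition_variable>
    #include <cstdio>
    #include <mutex>
    #include <queue>
    #include <thread>
    #include <vector>

    // Minimal stand-in for details::BlockingQueue<size_t>.
    template <typename T>
    class BlockingQueue {
     public:
      void Push(T v) {
        {
          std::lock_guard<std::mutex> g(mu_);
          q_.push(v);
        }
        cv_.notify_one();
      }
      T Pop() {
        std::unique_lock<std::mutex> l(mu_);
        cv_.wait(l, [this] { return !q_.empty(); });
        T v = q_.front();
        q_.pop();
        return v;
      }

     private:
      std::mutex mu_;
      std::condition_variable cv_;
      std::queue<T> q_;
    };

    struct Op {
      int id{0};
      std::vector<Op *> pending_ops;  // consumers of this op's outputs
      std::atomic<int> deps{0};       // not-ready inputs, like op_deps_
    };

    std::mutex thread_mu;
    std::vector<std::thread> threads;
    BlockingQueue<size_t> complete_q;

    void RunOpAsync(Op *op) {
      std::lock_guard<std::mutex> g(thread_mu);
      threads.emplace_back([op] {
        Op *op_to_run = op;
        size_t complete = 0;
        while (op_to_run != nullptr) {
          std::printf("run op %d\n", op_to_run->id);  // stands in for op->Run()
          ++complete;
          Op *next = nullptr;
          for (Op *p : op_to_run->pending_ops) {
            if (p->deps.fetch_sub(1) == 1) {  // p just became ready
              if (next == nullptr) {
                next = p;       // keep running on this thread
              } else {
                RunOpAsync(p);  // hand extra ready ops to new tasks
              }
            }
          }
          op_to_run = next;
        }
        complete_q.Push(complete);  // report how many ops this task finished
      });
    }

    int main() {
      // Diamond graph: 0 -> {1, 2} -> 3.
      std::vector<Op> ops(4);
      for (int i = 0; i < 4; ++i) ops[i].id = i;
      ops[0].pending_ops = {&ops[1], &ops[2]};
      ops[1].pending_ops = {&ops[3]};
      ops[2].pending_ops = {&ops[3]};
      ops[1].deps = 1;
      ops[2].deps = 1;
      ops[3].deps = 2;

      RunOpAsync(&ops[0]);  // bootstrap op: zero not-ready inputs
      size_t num_complete = 0;
      while (num_complete < ops.size()) num_complete += complete_q.Pop();

      std::lock_guard<std::mutex> g(thread_mu);
      for (auto &t : threads) t.join();
      return 0;
    }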
