
Commit a02ce58

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into revert_vlog
test=develop
2 parents: 30e47bc + 12e1719


62 files changed (+2647, -563 lines)

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -214,6 +214,7 @@ if (NOT WIN32)
     # there is no official support of warpctc, nccl, cupti in windows
     include(external/warpctc)   # download, build, install warpctc
     include(cupti)
+    include(external/gzstream)
 endif (NOT WIN32)

 if(WITH_DISTRIBUTE)

cmake/external/gzstream.cmake

Lines changed: 47 additions & 0 deletions
@@ -0,0 +1,47 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+IF(MOBILE_INFERENCE)
+  return()
+ENDIF()
+
+include (ExternalProject)
+
+# NOTE: gzstream is needed when linking with ctr reader.
+
+SET(GZSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/gzstream)
+SET(GZSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gzstream)
+SET(GZSTREAM_INCLUDE_DIR "${GZSTREAM_INSTALL_DIR}/include/" CACHE PATH "gzstream include directory." FORCE)
+
+ExternalProject_Add(
+  extern_gzstream
+  GIT_REPOSITORY    "https://github.com/jacquesqiao/gzstream.git"
+  GIT_TAG           ""
+  PREFIX            ${GZSTREAM_SOURCES_DIR}
+  UPDATE_COMMAND    ""
+  CONFIGURE_COMMAND ""
+  BUILD_IN_SOURCE   1
+  BUILD_COMMAND     make -j8
+  INSTALL_COMMAND   mkdir -p ${GZSTREAM_INSTALL_DIR}/lib/ && mkdir -p ${GZSTREAM_INSTALL_DIR}/include/
+                    && cp ${GZSTREAM_SOURCES_DIR}/src/extern_gzstream/libgzstream.a ${GZSTREAM_INSTALL_DIR}/lib
+                    && cp -r ${GZSTREAM_SOURCES_DIR}/src/extern_gzstream/gzstream.h ${GZSTREAM_INSTALL_DIR}/include
+)
+
+ADD_LIBRARY(gzstream STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET gzstream PROPERTY IMPORTED_LOCATION
+             "${GZSTREAM_INSTALL_DIR}/lib/libgzstream.a")
+
+include_directories(${GZSTREAM_INCLUDE_DIR})
+ADD_DEPENDENCIES(gzstream extern_gzstream zlib)
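For context on why this dependency is pulled in: gzstream wraps zlib in C++ iostream classes, so code such as the ctr reader mentioned in the NOTE above can read gzip-compressed text line by line. Below is a minimal, hedged sketch of that usage; the file name and the surrounding program are illustrative assumptions and are not part of this commit.

// Minimal sketch (assumption): reading a .gz text file through gzstream's
// igzstream, the class declared in the gzstream.h installed above.
// A program like this would link against libgzstream.a and zlib (hence the
// `zlib` dependency declared for the imported target).
#include <gzstream.h>

#include <iostream>
#include <string>

int main() {
  igzstream in("data.txt.gz");  // hypothetical input file
  if (!in.good()) {
    std::cerr << "failed to open gzip file" << std::endl;
    return 1;
  }
  std::string line;
  while (std::getline(in, line)) {
    std::cout << line << '\n';  // each line arrives already decompressed
  }
  return 0;
}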

cmake/inference_lib.cmake

Lines changed: 1 addition & 2 deletions
@@ -186,8 +186,7 @@ set(module "inference")
 copy(inference_lib DEPS ${inference_deps}
   SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
        ${src_dir}/${module}/api/paddle_*.h
-       ${PADDLE_BINARY_DIR}/paddle/fluid/inference/api/paddle_inference_pass.h
-  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
+  DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module}
 )

 set(module "platform")

paddle/fluid/API.spec

Lines changed: 2 additions & 2 deletions
@@ -97,8 +97,8 @@ paddle.fluid.layers.warpctc ArgSpec(args=['input', 'label', 'blank', 'norm_by_ti
 paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None))
-paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0))
-paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
+paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name', 'sampler', 'custom_dist', 'seed', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, 'uniform', None, 0, False))
+paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name', 'path_table', 'path_code', 'is_custom', 'is_sparse'], varargs=None, keywords=None, defaults=(None, None, None, None, None, False, False))
 paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None))
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)

paddle/fluid/framework/details/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
@@ -39,11 +39,12 @@ if (WITH_GPU)
 endif()

 cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
+cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass)

 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
         scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)

-set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass)
+set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass)
 if (WITH_GPU)
   list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
 endif()

paddle/fluid/framework/details/all_reduce_deps_pass.cc

Lines changed: 125 additions & 0 deletions

@@ -0,0 +1,125 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <vector>
+
+#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
+#include "paddle/fluid/framework/details/all_reduce_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/details/op_graph_view.h"
+#include "paddle/fluid/framework/details/var_handle.h"
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/op_proto_maker.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+static constexpr char kAllOpDescs[] = "all_op_descs";
+
+VarHandle* GetValidInput(const OpHandleBase* a) {
+  for (auto p : a->Inputs()) {
+    VarHandle* b = dynamic_cast<VarHandle*>(p);
+    if (b) {
+      return b;
+    }
+  }
+
+  return nullptr;
+}
+
+std::unique_ptr<ir::Graph> AllReduceDepsPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  auto graph_ops = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
+
+  // get vars order
+  int order = 0;
+  std::unordered_map<std::string, int> vars;
+  // TODO(gongwb): use graph topology sort to find the order of operators.
+  //               Note that must assert topology sort is stable
+  auto& ops = Get<const std::vector<OpDesc*>>(kAllOpDescs);
+  for (auto* op_desc : ops) {
+    auto outputs = op_desc->Outputs();
+    for (auto& o_it : outputs) {
+      for (auto& v : o_it.second) {  // values
+        vars[v] = order;
+      }
+    }
+    order++;
+  }
+
+  std::vector<OpHandleBase*> dist_ops;
+  // get allreduce ops.
+  for (auto& op : graph_ops) {
+    // FIXME(gongwb): add broadcast.
+    if (op->Name() == "all_reduce" || op->Name() == "reduce") {
+      dist_ops.push_back(op);
+    }
+  }
+
+  VLOG(10) << "dist_ops size:" << dist_ops.size() << std::endl;
+
+  std::sort(dist_ops.begin(), dist_ops.end(), [&](OpHandleBase* op1,
+                                                  OpHandleBase* op2) {
+    VarHandle* i0 = dynamic_cast<VarHandle*>(GetValidInput(op1));
+    VarHandle* i1 = dynamic_cast<VarHandle*>(GetValidInput(op2));
+
+    PADDLE_ENFORCE(i0 != nullptr && i1 != nullptr, "%s convert to %s error",
+                   op1->DebugString(), op2->DebugString());
+
+    auto l_it = vars.find(i0->name_);
+    auto r_it = vars.find(i1->name_);
+
+    if (l_it->second < r_it->second) return true;
+
+    if (l_it->second == r_it->second) {
+      return i0->name_ < i1->name_;
+    }
+
+    return false;
+  });
+
+  // add dependency.
+  auto& sorted_ops = dist_ops;
+  for (size_t i = 1; i < sorted_ops.size(); ++i) {
+    auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar());
+
+    auto* pre_op = sorted_ops[i - 1];
+    auto* op = sorted_ops[i];
+
+    pre_op->AddOutput(dep_var);
+    op->AddInput(dep_var);
+    graph->Get<GraphDepVars>(kGraphDepVars).emplace(dep_var);
+
+    VLOG(10) << "add all_reduce sequential dependencies between " << pre_op
+             << " and " << op;
+
+    VLOG(10) << "pre_op:" << pre_op->DebugString()
+             << ", op:" << op->DebugString();
+  }
+
+  return graph;
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(all_reduce_deps_pass,
+              paddle::framework::details::AllReduceDepsPass)
+    .RequirePassAttr(paddle::framework::details::kAllOpDescs);
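
To make the intent of this pass easier to see outside the framework types, here is a small hedged sketch of the same ordering idea in isolation: collective ops are sorted by the program-order position of the op that produced their input (with the variable name as a deterministic tie-breaker) and then chained so each must wait for its predecessor. CollectiveOp and ChainInDeterministicOrder are illustrative names, not Paddle symbols.

#include <algorithm>
#include <string>
#include <vector>

// Illustrative stand-in for an all_reduce/reduce op handle.
struct CollectiveOp {
  std::string input_var;                 // variable the op consumes
  int producer_order;                    // program-order index of the op that wrote input_var
  std::vector<CollectiveOp*> run_after;  // explicit control-dependency edges
};

// Sort deterministically, then chain each op after its predecessor, mirroring
// the dummy control-dependency variables inserted by the pass above.
void ChainInDeterministicOrder(std::vector<CollectiveOp*>* ops) {
  std::sort(ops->begin(), ops->end(),
            [](const CollectiveOp* a, const CollectiveOp* b) {
              if (a->producer_order != b->producer_order) {
                return a->producer_order < b->producer_order;
              }
              return a->input_var < b->input_var;  // tie-break by name
            });
  for (size_t i = 1; i < ops->size(); ++i) {
    (*ops)[i]->run_after.push_back((*ops)[i - 1]);
  }
}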

paddle/fluid/framework/details/all_reduce_deps_pass.h

Lines changed: 33 additions & 0 deletions

@@ -0,0 +1,33 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/pass.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// TODO(gongwb): overlap allreduce with backward computation.
+class AllReduceDepsPass : public ir::Pass {
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle

paddle/fluid/framework/details/build_strategy.cc

Lines changed: 21 additions & 0 deletions
@@ -16,6 +16,7 @@ limitations under the License. */

 #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
+#include "paddle/fluid/framework/details/reduce_op_handle.h"
 #include "paddle/fluid/framework/details/sequential_execution_pass.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
@@ -24,6 +25,10 @@ namespace paddle {
 namespace framework {
 namespace details {

+static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
+  return (!strategy.enable_sequential_execution_ && strategy.num_trainers_ > 1);
+}
+
 class ParallelExecutorPassBuilder : public ir::PassBuilder {
  public:
   explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
@@ -70,6 +75,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
     // Verify that the graph is correct for multi-device executor.
     AppendPass("multi_devices_check_pass");

+    if (SeqOnlyAllReduceOps(strategy)) {
+      AppendPass("all_reduce_deps_pass");
+    }
+
     if (strategy_.remove_unnecessary_lock_) {
       AppendPass("modify_op_lock_and_record_event_pass");
     }
@@ -124,6 +133,17 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
       pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
 #endif
     } else if (pass->Type() == "sequential_execution_pass") {
+      VLOG(1) << "set enable_sequential_execution:"
+              << enable_sequential_execution_;
+
+      pass->Erase(kAllOpDescs);
+      pass->Set<const std::vector<OpDesc *>>(
+          kAllOpDescs,
+          new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
+    } else if (pass->Type() == "all_reduce_deps_pass") {
+      VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
+              << ", num_trainers:" << num_trainers_;
+
       pass->Erase(kAllOpDescs);
       pass->Set<const std::vector<OpDesc *>>(
           kAllOpDescs,
@@ -144,4 +164,5 @@ USE_PASS(multi_devices_pass);
 USE_PASS(multi_devices_check_pass);
 USE_PASS(multi_devices_print_pass);
 USE_PASS(sequential_execution_pass);
+USE_PASS(all_reduce_deps_pass);
 USE_PASS(modify_op_lock_and_record_event_pass);

paddle/fluid/framework/details/build_strategy.h

Lines changed: 1 addition & 0 deletions
@@ -73,6 +73,7 @@ struct BuildStrategy {

   bool fuse_broadcast_op_{false};

+  int num_trainers_{1};
   bool remove_unnecessary_lock_{false};

   // NOTE:

paddle/fluid/framework/operator.h

Lines changed: 1 addition & 1 deletion
@@ -71,7 +71,7 @@ class OperatorBase;
 class ExecutionContext;

 /**
- * OperatorBase has the basic element that Net will call to do computation.
+ * OperatorBase has the basic elements that Net will call to do computation.
  * Only CreateOperator from OpRegistry will new Operator directly. User
  * should always construct a proto message OpDesc and call
  * OpRegistry::CreateOp(op_desc) to get an Operator instance.
