Skip to content

Commit ac0e0f5

Browse files
committed
merge develop
test=develop
2 parents fc12f38 + 4ef6f73 commit ac0e0f5

File tree

149 files changed

+5634
-2708
lines changed

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

149 files changed

+5634
-2708
lines changed

benchmark/fluid/fluid_benchmark.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,6 @@ def train_parallel(train_args, test_args, args, train_prog, test_prog,
179179
else:
180180
build_strategy.reduce_strategy = fluid.BuildStrategy(
181181
).ReduceStrategy.AllReduce
182-
build_strategy.fuse_broadcast_op = args.fuse_broadcast_op
183182

184183
avg_loss = train_args[0]
185184

paddle/fluid/API.spec

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -302,13 +302,16 @@ paddle.fluid.layers.sigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=
302302
paddle.fluid.layers.logsigmoid (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '81ccb7acafd06c7728e11581f5d342e3'))
303303
paddle.fluid.layers.exp (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e6b3e769413d96aab4176f96db25984b'))
304304
paddle.fluid.layers.tanh (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'e9d586a0b5bd05f67ee78048f9d503b6'))
305+
paddle.fluid.layers.atan (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '3a46e0b5f9ce82348406478e610f14c9'))
305306
paddle.fluid.layers.tanh_shrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '1e521554b9fdda9061ec6d306f0709b7'))
306307
paddle.fluid.layers.softshrink (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '9eef31597bbafa2bd49691e072296e13'))
307308
paddle.fluid.layers.sqrt (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '072a8541e0f632366bba10f67cb0db27'))
308309
paddle.fluid.layers.abs (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '64650ac42cf82e9920cb0b172b1d29fd'))
309310
paddle.fluid.layers.ceil (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c75d67dc5fe28f68e4cfffead4f698ad'))
310311
paddle.fluid.layers.floor (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '647b16c5da5ef909649ae02abb434973'))
311312
paddle.fluid.layers.cos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '485f2686bcc2fe37a4bd893769c8a3e2'))
313+
paddle.fluid.layers.acos (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '920a47734482276c069ba24c61c26b25'))
314+
paddle.fluid.layers.asin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cf4ee2c9b9d7293556f8c5173dfb5d2c'))
312315
paddle.fluid.layers.sin (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '01f1766aa76eff1df30147505b59f7c4'))
313316
paddle.fluid.layers.round (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'b47f5da13913d3e56bdb1e612a73f3f2'))
314317
paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'cc6ac2f14f03c52aaa83a59bf83b8d26'))

paddle/fluid/framework/CMakeLists.txt

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,10 +38,10 @@ if(WITH_GPU)
3838
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context)
3939
add_dependencies(tensor tensor_util)
4040
else()
41-
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context )
41+
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS place memory data_type device_context profiler)
4242
endif(WIN32)
4343
else()
44-
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context )
44+
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS place memory data_type device_context profiler)
4545
endif()
4646

4747
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
@@ -174,7 +174,7 @@ else()
174174
cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
175175
endif()
176176

177-
target_link_libraries(executor garbage_collector)
177+
target_link_libraries(executor garbage_collector while_op_helper)
178178

179179
cc_library(parallel_executor SRCS parallel_executor.cc DEPS
180180
threaded_ssa_graph_executor scope_buffered_ssa_graph_executor parallel_ssa_graph_executor

paddle/fluid/framework/details/CMakeLists.txt

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,8 @@ cc_library(inplace_op_pass SRCS inplace_op_pass.cc DEPS memory_optimize_pass op_
6161
cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
6262
cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle)
6363
cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper)
64-
cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass)
64+
cc_library(while_op_eager_deletion_pass SRCS while_op_eager_deletion_pass.cc DEPS while_op_helper graph_helper pass computation_op_handle)
65+
cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass while_op_eager_deletion_pass)
6566
cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper)
6667

6768
cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)

paddle/fluid/framework/details/computation_op_handle.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414

1515
#pragma once
1616

17+
#include <memory>
1718
#include <string>
1819
#include <vector>
1920

@@ -31,6 +32,8 @@ class ComputationOpHandle : public OpHandleBase {
3132
ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place,
3233
size_t scope_idx);
3334

35+
OperatorBase *GetOp() { return op_.get(); }
36+
3437
std::string Name() const override;
3538

3639
const Scope *GetScope() const { return scope_; }

paddle/fluid/framework/details/eager_deletion_op_handle.cc

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,10 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
#include <memory>
16+
#include <unordered_set>
17+
#include <utility>
18+
1519
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
1620
#include "paddle/fluid/framework/lod_tensor_array.h"
1721
#include "paddle/fluid/framework/scope.h"
@@ -45,6 +49,7 @@ EagerDeletionOpHandle::EagerDeletionOpHandle(
4549
}
4650
}
4751
#endif
52+
PADDLE_ENFORCE(!var_names_.empty(), "Var names cannot be empty");
4853
}
4954

5055
EagerDeletionOpHandle::~EagerDeletionOpHandle() {
@@ -60,15 +65,20 @@ EagerDeletionOpHandle::~EagerDeletionOpHandle() {
6065
std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; }
6166

6267
void EagerDeletionOpHandle::RunImpl() {
63-
auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
68+
Scope *exec_scope = nullptr;
6469
std::deque<std::shared_ptr<memory::Allocation>> garbages;
6570
for (auto &name : var_names_) {
6671
auto it = ref_cnts_->find(name);
67-
// Var not found, not reference count has not decreased to 0
72+
// Reference count has not decreased to 0
6873
if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) {
6974
continue;
7075
}
7176

77+
if (!exec_scope) {
78+
exec_scope = scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
79+
}
80+
81+
// Var not found
7282
auto *var = exec_scope->FindVar(name);
7383
if (var == nullptr) {
7484
continue;

paddle/fluid/framework/details/eager_deletion_pass.cc

Lines changed: 166 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,20 +12,173 @@
1212
// See the License for the specific language governing permissions and
1313
// limitations under the License.
1414

15+
#include <algorithm>
16+
#include <functional>
1517
#include <queue>
1618
#include <string>
19+
#include <tuple>
1720
#include <vector>
1821

1922
#include "paddle/fluid/framework/details/computation_op_handle.h"
2023
#include "paddle/fluid/framework/details/eager_deletion_op_handle.h"
21-
#include "paddle/fluid/framework/details/eager_deletion_pass.h"
2224
#include "paddle/fluid/framework/details/multi_devices_helper.h"
2325
#include "paddle/fluid/framework/ir/graph_helper.h"
2426

27+
DEFINE_double(memory_fraction_of_eager_deletion, 1.0,
28+
"Fraction of eager deletion. If less than 1.0, all variables in "
29+
"the program would be sorted according to its memory size, and "
30+
"only the FLAGS_memory_fraction_of_eager_deletion of the largest "
31+
"variables would be deleted.");
32+
2533
namespace paddle {
2634
namespace framework {
2735
namespace details {
2836

37+
// op -> variables which can be deleted after op runs
38+
using OpToVarNameSetMap =
39+
std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>;
40+
41+
// Check whether the variable is LoDTensor based on static VarDesc info
42+
static bool IsLoDTensor(VarDesc *var) {
43+
return var->Proto()->type().type() == proto::VarType::LOD_TENSOR;
44+
}
45+
46+
// Get memory size of LoDTensor
47+
static int64_t GetMemorySize(
48+
const std::unordered_map<std::string, std::vector<VarHandle *>> &vars,
49+
const std::string &var_name) {
50+
auto *var_desc = TryGetLatestVarDesc(vars.at(var_name));
51+
PADDLE_ENFORCE_NOT_NULL(var_desc);
52+
PADDLE_ENFORCE(IsLoDTensor(var_desc));
53+
auto dims = var_desc->GetShape();
54+
return SizeOfType(var_desc->GetDataType()) *
55+
std::accumulate(dims.begin(), dims.end(), static_cast<int64_t>(1),
56+
std::multiplies<int64_t>());
57+
}
58+
59+
// Split all variables in the graph into LoDTensor and Non-LoDTensor (e.g.
60+
// SelectedRows, LoDTensorArray)
61+
// Since partial GC is based on static analysis of memory size of each variable
62+
// So we should skip SelectedRows and LoDTensorArray here
63+
static void SplitIntoLoDTensorAndNonLoDTensorVars(
64+
const OpToVarNameSetMap &m, const GraphVars &vars,
65+
OpToVarNameSetMap *lod_tensors, OpToVarNameSetMap *other_vars) {
66+
lod_tensors->clear();
67+
other_vars->clear();
68+
69+
for (auto &op_vars_pair : m) {
70+
for (auto &var_name : op_vars_pair.second) {
71+
auto *var_desc = TryGetLatestVarDesc(
72+
vars[op_vars_pair.first->GetScopeIdx()].at(var_name));
73+
if (IsLoDTensor(var_desc)) {
74+
(*lod_tensors)[op_vars_pair.first].insert(var_name);
75+
} else {
76+
(*other_vars)[op_vars_pair.first].insert(var_name);
77+
}
78+
}
79+
}
80+
}
81+
82+
struct GCVarInfo {
83+
GCVarInfo(const std::string &name, int64_t memory_size,
84+
ComputationOpHandle *op, size_t scope_idx)
85+
: name_(name),
86+
memory_size_(memory_size),
87+
op_(op),
88+
scope_idx_(scope_idx) {}
89+
90+
std::string name_; // variable name
91+
int64_t memory_size_; // memory size
92+
ComputationOpHandle *op_; // op after which the variable could be deleted
93+
size_t scope_idx_; // scope index where the variable locates
94+
95+
int64_t AbsMemorySize() const { return std::abs(memory_size_); }
96+
};
97+
98+
// Delete delete_lod_tensor_only is not used currently
99+
static OpToVarNameSetMap ShrinkGCVars(
100+
const OpToVarNameSetMap &m, const GraphVars &vars,
101+
const std::vector<platform::Place> &places, double fraction_of_memory_size,
102+
bool delete_lod_tensor_only = false) {
103+
// Do not perform gc when fraction_of_memory_size = 0
104+
if (fraction_of_memory_size <= 0.0) return {};
105+
106+
/**
107+
* Step 1: Split all variables into LoDTensor and Non-LoDTensor.
108+
* We can only calculate memory size of LoDTensors
109+
*/
110+
OpToVarNameSetMap lod_tensors, other_vars;
111+
SplitIntoLoDTensorAndNonLoDTensorVars(m, vars, &lod_tensors, &other_vars);
112+
113+
// Perform complete gc when fraction_of_memory_size >= 1
114+
if (fraction_of_memory_size >= 1.0) {
115+
return delete_lod_tensor_only ? lod_tensors : m;
116+
}
117+
118+
/**
119+
* Step 2: build GCVarInfos, and calculate total memory sizes of each device
120+
*/
121+
122+
// place -> variable info (name, memory size, place, scope_idx)
123+
std::map<platform::Place, std::vector<GCVarInfo>> place_to_vars;
124+
125+
// place -> total memory sizes
126+
std::map<platform::Place, int64_t> place_to_size;
127+
for (auto &op_vars_pair : lod_tensors) {
128+
auto *op = op_vars_pair.first;
129+
auto &var_names = op_vars_pair.second;
130+
auto scope_idx = op->GetScopeIdx();
131+
auto &place = places[scope_idx];
132+
133+
for (auto &var_name : var_names) {
134+
auto var_size = GetMemorySize(vars[scope_idx], var_name);
135+
GCVarInfo var_info(var_name, var_size, op, scope_idx);
136+
place_to_size[place] += var_info.AbsMemorySize();
137+
place_to_vars[place].emplace_back(std::move(var_info));
138+
}
139+
}
140+
141+
/**
142+
* Step 3: sort GCVarInfos, and only delete the largest variables.
143+
*/
144+
OpToVarNameSetMap partial_vars;
145+
for (auto &place_to_var_pair : place_to_vars) {
146+
auto &place = place_to_var_pair.first;
147+
auto &gc_vars = place_to_var_pair.second;
148+
std::sort(gc_vars.begin(), gc_vars.end(),
149+
[](const GCVarInfo &var1, const GCVarInfo &var2) {
150+
return var1.AbsMemorySize() > var2.AbsMemorySize();
151+
});
152+
153+
int64_t accumulated_size = 0;
154+
int64_t size_threshold =
155+
static_cast<int64_t>(fraction_of_memory_size * place_to_size[place]);
156+
for (size_t i = 0; i < gc_vars.size() && accumulated_size < size_threshold;
157+
++i) {
158+
partial_vars[gc_vars[i].op_].insert(gc_vars[i].name_);
159+
accumulated_size += gc_vars[i].AbsMemorySize();
160+
}
161+
}
162+
163+
/**
164+
* Step 4: Combine other vars (SelectedRows, LoDTensorArray)
165+
*/
166+
if (!delete_lod_tensor_only) {
167+
for (auto &op_vars_pair : other_vars) {
168+
partial_vars[op_vars_pair.first].insert(op_vars_pair.second.begin(),
169+
op_vars_pair.second.end());
170+
}
171+
}
172+
173+
return partial_vars;
174+
}
175+
176+
class EagerDeletionPass : public ir::Pass {
177+
protected:
178+
std::unique_ptr<ir::Graph> ApplyImpl(
179+
std::unique_ptr<ir::Graph> graph) const override;
180+
};
181+
29182
std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
30183
std::unique_ptr<ir::Graph> graph) const {
31184
auto &ref_cnts =
@@ -43,9 +196,7 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
43196

44197
// a reverse map of last_live_ops
45198
// i.e., last op --> variable names which can be deleted.
46-
std::unordered_map<ComputationOpHandle *, std::unordered_set<std::string>>
47-
op_vars_map;
48-
199+
OpToVarNameSetMap op_vars_map;
49200
for (auto &var_ops_map : last_live_ops) {
50201
for (auto &var_ops_pair : var_ops_map) {
51202
const std::string &var_name = var_ops_pair.first;
@@ -55,6 +206,9 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
55206
}
56207
}
57208

209+
op_vars_map = ShrinkGCVars(op_vars_map, vars, places,
210+
FLAGS_memory_fraction_of_eager_deletion);
211+
58212
for (auto &pair : op_vars_map) {
59213
auto *op = pair.first;
60214
auto &var_names = pair.second;
@@ -85,8 +239,13 @@ std::unique_ptr<ir::Graph> EagerDeletionPass::ApplyImpl(
85239
eager_deletion_op->AddOutput(dummy_leaf);
86240
}
87241

242+
VLOG(10) << "FLAGS_memory_fraction_of_eager_deletion = "
243+
<< FLAGS_memory_fraction_of_eager_deletion;
88244
VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)";
89-
return graph;
245+
246+
auto while_op_eager_deletion_pass =
247+
ir::PassRegistry::Instance().Get("while_op_eager_deletion_pass");
248+
return while_op_eager_deletion_pass->Apply(std::move(graph));
90249
}
91250

92251
} // namespace details
@@ -99,3 +258,5 @@ REGISTER_PASS(eager_deletion_pass,
99258
.RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars)
100259
.RequirePassAttr(paddle::framework::details::kAllPlaces)
101260
.RequirePassAttr(paddle::framework::details::kGarbageCollector);
261+
262+
USE_PASS(while_op_eager_deletion_pass);

paddle/fluid/framework/details/eager_deletion_pass.h

Lines changed: 0 additions & 32 deletions
This file was deleted.

paddle/fluid/framework/details/inplace_op_pass.cc

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
#include <algorithm>
1717
#include <deque>
1818
#include <iterator>
19+
#include <memory>
1920
#include <stack>
2021
#include <string>
2122
#include <unordered_map>
@@ -263,6 +264,10 @@ void InplacePass::WithdrawModify(const NodeSwapQueue& nodes,
263264
void InplacePass::TryInplaceOpInputOutput(ir::Node* op,
264265
ir::Graph* graph) const {
265266
VLOG(4) << "Try to inplace op " << op->Name();
267+
// FIXME(liuwei1031): Graph is not aware of the existence of BlockDescs and
268+
// ProgramDescs.
269+
// The operations related to BlockDesc or ProgramDesc should perform on Graph
270+
// or Node directly!
266271
PADDLE_ENFORCE(op->Op() != nullptr && op->Op()->Block() != nullptr,
267272
"op_desc is nullptr");
268273
// some pre-requirments need to meet if the op want to inplaced.

0 commit comments

Comments
 (0)