
Commit 2835e04

merge develop branch. test=develop

2 parents deb4af7 + 0953cd3

138 files changed: +5023 −585 lines


CMakeLists.txt

Lines changed: 0 additions & 1 deletion
@@ -62,7 +62,6 @@ option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
 option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
 option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
 option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
-option(WITH_FAST_BUNDLE_TEST "Bundle tests that can be run in a single process together to reduce launch overhead" OFF)
 option(WITH_CONTRIB "Compile the third-party contributation" OFF)
 option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
 option(WITH_ANAKIN "Compile with Anakin library" OFF)

cmake/configure.cmake

Lines changed: 1 addition & 5 deletions
@@ -50,11 +50,7 @@ if(NOT WITH_PROFILER)
 endif(NOT WITH_PROFILER)
 
 if(NOT CMAKE_CROSSCOMPILING)
-  if(WITH_AVX AND AVX512F_FOUND)
-    set(SIMD_FLAG ${AVX512F_FLAG})
-  elseif(WITH_AVX AND AVX2_FOUND)
-    set(SIMD_FLAG ${AVX2_FLAG})
-  elseif(WITH_AVX AND AVX_FOUND)
+  if(WITH_AVX AND AVX_FOUND)
     set(SIMD_FLAG ${AVX_FLAG})
   elseif(SSE3_FOUND)
     set(SIMD_FLAG ${SSE3_FLAG})

cmake/simd.cmake

Lines changed: 3 additions & 1 deletion
@@ -89,7 +89,9 @@ CHECK_CXX_SOURCE_RUNS("
 #include <immintrin.h>
 int main()
 {
-    __m512i a = _mm512_undefined_epi32();
+    __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4,
+                                  13, -5, 6, -7, 9, 2, -6, 3);
+    __m512i result = _mm512_abs_epi32 (a);
     return 0;
 }" AVX512F_FOUND)
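Note (an inference, not stated in the commit): _mm512_undefined_epi32() only yields an uninitialized register, so the compiler need not emit any AVX-512 instruction and the CHECK_CXX_SOURCE_RUNS probe could succeed on CPUs without AVX512F; _mm512_abs_epi32 forces real AVX-512 code into the test binary. This pairs with the configure.cmake hunk above, which stops promoting SIMD_FLAG past AVX even when AVX2/AVX512F are detected. A standalone sketch of the new probe, assuming g++ or clang++ with -mavx512f:

#include <immintrin.h>

int main() {
  // _mm512_abs_epi32 must compile to a real AVX-512 instruction (vpabsd),
  // so this program dies with SIGILL on CPUs lacking AVX512F.
  __m512i a = _mm512_set_epi32(-1, 2, -3, 4, -1, 2, -3, 4,
                               13, -5, 6, -7, 9, 2, -6, 3);
  __m512i result = _mm512_abs_epi32(a);
  (void)result;
  return 0;
}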

paddle/fluid/API.spec

Lines changed: 5 additions & 3 deletions
@@ -67,8 +67,8 @@ paddle.fluid.layers.conv3d ArgSpec(args=['input', 'num_filters', 'filter_size',
 paddle.fluid.layers.sequence_pool ArgSpec(args=['input', 'pool_type', 'is_test'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None))
-paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
-paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None))
+paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
+paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True))
 paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False))
 paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None))
@@ -103,7 +103,7 @@ paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 's
 paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.layer_norm ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None))
-paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100))
+paddle.fluid.layers.softmax_with_cross_entropy ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode'], varargs=None, keywords=None, defaults=(False, -100, False))
 paddle.fluid.layers.smooth_l1 ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.one_hot ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.autoincreased_step_counter ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1))
@@ -174,9 +174,11 @@ paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None
 paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None))
 paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.sequence_reverse ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.affine_channel ArgSpec(args=['x', 'scale', 'bias', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(None, None, 'NCHW', None))
 paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None))
+paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None))
 paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
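Besides registering the new affine_grid and grid_sampler layers and the numeric_stable_mode flag on softmax_with_cross_entropy, this spec update adds an exclusive argument to pool2d/pool3d, which controls whether zero-padded positions count toward the divisor in average pooling (the new default is True, i.e. padding is excluded). A toy 1-D illustration of those semantics — not Paddle code — in C++:

#include <cstdio>
#include <vector>

// 1-D average pooling with window k, stride 1, and `pad` zeros on each side.
// With exclusive=true only in-bounds elements enter the divisor; with
// exclusive=false the divisor is always k, as if the padding were real zeros.
std::vector<double> AvgPool1D(const std::vector<double> &in, int k, int pad,
                              bool exclusive) {
  int n = static_cast<int>(in.size());
  std::vector<double> out;
  for (int start = -pad; start + k <= n + pad; ++start) {
    double sum = 0.0;
    int valid = 0;
    for (int i = start; i < start + k; ++i) {
      if (i >= 0 && i < n) {
        sum += in[i];
        ++valid;
      }
    }
    out.push_back(sum / (exclusive ? valid : k));
  }
  return out;
}

int main() {
  const std::vector<double> x = {1, 2, 3, 4};
  for (bool exclusive : {true, false}) {
    // First window covers one pad cell plus {1, 2}: 1.5 exclusive, 1.0 not.
    for (double v : AvgPool1D(x, 3, 1, exclusive)) std::printf("%.3f ", v);
    std::printf("(exclusive=%d)\n", exclusive);
  }
  return 0;
}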

paddle/fluid/framework/details/CMakeLists.txt

Lines changed: 11 additions & 5 deletions
@@ -1,5 +1,6 @@
 cc_library(var_handle SRCS var_handle.cc DEPS place framework_proto node)
 cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context lod_tensor)
+cc_library(op_graph_view SRCS op_graph_view.cc DEPS op_handle_base)
 cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
 cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
@@ -30,20 +31,25 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_
 cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor)
 cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope)
 
-if(WITH_GPU)
+cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper)
+
+if (WITH_GPU)
   cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle
       all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass)
 endif()
 
+cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass)
+
 cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle
     scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle)
 
-if(WITH_GPU)
-  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto reference_count_pass)
-else()
-  cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS graph framework_proto)
+set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass)
+if (WITH_GPU)
+  list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass)
 endif()
 
+cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS})
+
 cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
     simple_threadpool device_context)

paddle/fluid/framework/details/build_strategy.cc

Lines changed: 16 additions & 0 deletions
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h"
 #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h"
+#include "paddle/fluid/framework/details/sequential_execution_pass.h"
 #include "paddle/fluid/framework/ir/graph.h"
 #include "paddle/fluid/framework/ir/graph_viz_pass.h"
 
@@ -27,6 +28,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
  public:
   explicit ParallelExecutorPassBuilder(const BuildStrategy &strategy)
       : ir::PassBuilder(), strategy_(strategy) {
+    if (strategy_.enable_sequential_execution_) {
+      AppendPass("sequential_execution_pass");
+    }
+
     // Add a graph viz pass to record a graph.
     if (!strategy_.debug_graphviz_path_.empty()) {
       auto viz_pass = AppendPass("graph_viz_pass");
@@ -64,6 +69,10 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder {
 
     // Verify that the graph is correct for multi-device executor.
     AppendPass("multi_devices_check_pass");
+
+    if (strategy_.remove_unnecessary_lock_) {
+      AppendPass("modify_op_lock_and_record_event_pass");
+    }
   }
 
  private:
@@ -110,6 +119,11 @@ std::unique_ptr<ir::Graph> BuildStrategy::Apply(
       pass->Erase("nccl_ctxs");
       pass->SetNotOwned<platform::NCCLContextMap>("nccl_ctxs", nctx);
 #endif
+    } else if (pass->Type() == "sequential_execution_pass") {
+      pass->Erase(kAllOpDescs);
+      pass->Set<const std::vector<OpDesc *>>(
+          kAllOpDescs,
+          new std::vector<OpDesc *>(main_program.Block(0).AllOps()));
     }
     graph = pass->Apply(std::move(graph));
   }
@@ -125,3 +139,5 @@ USE_PASS(multi_batch_merge_pass);
 USE_PASS(multi_devices_pass);
 USE_PASS(multi_devices_check_pass);
 USE_PASS(multi_devices_print_pass);
+USE_PASS(sequential_execution_pass);
+USE_PASS(modify_op_lock_and_record_event_pass);
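The two new USE_PASS lines mirror the REGISTER_PASS in each pass's translation unit. A toy model of that pairing — hypothetical macros, not Paddle's actual definitions: registration runs as a static initializer in the pass's object file, and the USE_PASS side references an anchor symbol from that file so a static linker cannot drop the otherwise-unreferenced object file:

#include <functional>
#include <iostream>
#include <map>
#include <string>

using PassFactory = std::function<void()>;

// Global pass registry, filled at static-initialization time.
inline std::map<std::string, PassFactory> &Registry() {
  static std::map<std::string, PassFactory> r;
  return r;
}

// In the pass's .cc: register a factory and emit an anchor function.
#define TOY_REGISTER_PASS(name, factory)                             \
  static int reg_pass_##name = (Registry()[#name] = factory, 0);     \
  int touch_pass_##name() { return 0; }

// In the user's .cc: reference the anchor so the pass's object file
// stays linked in even though nothing else calls into it.
#define TOY_USE_PASS(name)        \
  extern int touch_pass_##name(); \
  static int use_pass_##name = touch_pass_##name();

TOY_REGISTER_PASS(sequential_execution_pass,
                  [] { std::cout << "building sequential_execution_pass\n"; });
TOY_USE_PASS(sequential_execution_pass);

int main() {
  Registry().at("sequential_execution_pass")();  // runs the factory
  return 0;
}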

paddle/fluid/framework/details/build_strategy.h

Lines changed: 4 additions & 0 deletions
@@ -69,8 +69,12 @@ struct BuildStrategy {
 
   bool enable_data_balance_{false};
 
+  bool enable_sequential_execution_{false};
+
   bool fuse_broadcast_op_{false};
 
+  bool remove_unnecessary_lock_{false};
+
   // User normally doesn't need to call this API.
   // The PassBuilder allows for more customized insert, remove of passes
   // from python side.
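A minimal sketch of switching on the two new knobs; the BuildStrategy fields are from this diff, while the surrounding function and the wiring into a ParallelExecutor are illustrative only:

#include "paddle/fluid/framework/details/build_strategy.h"

paddle::framework::details::BuildStrategy MakeStrategy() {
  paddle::framework::details::BuildStrategy s;
  // Appends sequential_execution_pass: ops run strictly in program order.
  s.enable_sequential_execution_ = true;
  // Appends modify_op_lock_and_record_event_pass: eligible GPU computation
  // ops skip the device lock and CUDA event (see the pass below).
  s.remove_unnecessary_lock_ = true;
  return s;
}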

paddle/fluid/framework/details/computation_op_handle.cc

Lines changed: 8 additions & 2 deletions
@@ -29,9 +29,15 @@ ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope,
 void ComputationOpHandle::RunImpl() {
   WaitInputVarGenerated(place_);
 
-  this->RunAndRecordEvent([this] {
+  auto run_func = [this]() {
     op_->Run(*scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(), place_);
-  });
+  };
+
+  if (is_lock_and_record_event_free_) {
+    run_func();
+  } else {
+    this->RunAndRecordEvent(run_func);
+  }
 }
 
 bool ComputationOpHandle::NeedWait(VarHandleBase *in_var) {
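The rationale, inferred from the pass below rather than stated in the commit: RunAndRecordEvent takes a device lock and records a CUDA event so that consumers on other streams or places can wait on the result; when every consumer runs on the same place, stream order already serializes them, so both costs can be skipped. A generic sketch of the dispatch, with stand-in types:

#include <functional>
#include <iostream>

// Stand-in for OpHandleBase::RunAndRecordEvent: take the device lock, run
// the body, then record an event that other streams can wait on.
void RunAndRecordEvent(const std::function<void()> &body) {
  std::cout << "acquire device lock\n";
  body();
  std::cout << "record CUDA event\n";
}

void RunImpl(bool is_lock_and_record_event_free) {
  auto run_func = [] { std::cout << "run op\n"; };
  if (is_lock_and_record_event_free) {
    run_func();  // consumers share this op's stream: ordering is implicit
  } else {
    RunAndRecordEvent(run_func);  // cross-stream consumers wait on the event
  }
}

int main() {
  RunImpl(true);
  RunImpl(false);
  return 0;
}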

paddle/fluid/framework/details/computation_op_handle.h

Lines changed: 3 additions & 0 deletions
@@ -36,6 +36,8 @@ struct ComputationOpHandle : public OpHandleBase {
 
   const platform::Place &GetPlace() const { return place_; }
 
+  void SetLockAndRecordEventFree(bool b) { is_lock_and_record_event_free_ = b; }
+
  protected:
   void RunImpl() override;
 
@@ -45,6 +47,7 @@ struct ComputationOpHandle : public OpHandleBase {
   std::unique_ptr<OperatorBase> op_;
   Scope *scope_;
   platform::Place place_;
+  bool is_lock_and_record_event_free_{false};
 };
 }  // namespace details
 }  // namespace framework
paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.cc

Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/framework/details/modify_op_lock_and_record_event_pass.h"
+#include "paddle/fluid/framework/details/computation_op_handle.h"
+#include "paddle/fluid/framework/details/multi_devices_helper.h"
+#include "paddle/fluid/framework/details/op_graph_view.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+static bool IsLockAndRecordEventFreeComputationOpHandle(
+    ComputationOpHandle *op, const OpGraphView &graph_view) {
+  if (!platform::is_gpu_place(op->GetPlace())) return false;
+  for (auto &pending_op : graph_view.PendingOps(op)) {
+    auto *tmp = dynamic_cast<ComputationOpHandle *>(pending_op);
+    if (tmp == nullptr || !(tmp->GetPlace() == op->GetPlace())) {
+      return false;
+    }
+  }
+  return true;
+}
+
+std::unique_ptr<ir::Graph> ModifyOpLockAndRecordEventPass::ApplyImpl(
+    std::unique_ptr<ir::Graph> ir_graph) const {
+  auto &all_ops = ir_graph->Get<GraphOps>(kGraphOps);
+  OpGraphView graph_view(all_ops);
+  for (auto &op : all_ops) {
+    auto *compute_op = dynamic_cast<ComputationOpHandle *>(op.get());
+    if (compute_op == nullptr) continue;
+    bool is_lock_and_record_event_free =
+        IsLockAndRecordEventFreeComputationOpHandle(compute_op, graph_view);
+    compute_op->SetLockAndRecordEventFree(is_lock_and_record_event_free);
+    if (is_lock_and_record_event_free) {
+      VLOG(10) << "Set is_lock_and_record_event_free be true in op "
+               << compute_op->DebugString();
+    }
+  }
+  return ir_graph;
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(modify_op_lock_and_record_event_pass,
+              paddle::framework::details::ModifyOpLockAndRecordEventPass);
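A self-contained toy model of the predicate above, using hypothetical types rather than Paddle's: an op qualifies iff it runs on a GPU place and every pending (downstream) op is also a computation op on the same place:

#include <iostream>
#include <string>
#include <vector>

struct Op {
  bool is_computation;
  std::string place;                    // e.g. "GPU:0" or "CPU"
  std::vector<const Op *> pending_ops;  // ops consuming this op's outputs
};

// Mirrors IsLockAndRecordEventFreeComputationOpHandle above.
bool IsLockAndRecordEventFree(const Op &op) {
  if (op.place.rfind("GPU", 0) != 0) return false;  // GPU places only
  for (const Op *next : op.pending_ops) {
    if (!next->is_computation || next->place != op.place) return false;
  }
  return true;
}

int main() {
  Op sink{true, "GPU:0", {}};
  Op other{true, "GPU:1", {}};
  Op same_place{true, "GPU:0", {&sink}};
  Op cross_place{true, "GPU:0", {&other}};
  std::cout << IsLockAndRecordEventFree(same_place) << "\n";   // 1
  std::cout << IsLockAndRecordEventFree(cross_place) << "\n";  // 0
  return 0;
}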
