
Commit 8360685

Merge remote-tracking branch 'ups/develop' into refine/op/gru

2 parents: 18c322c + b656d97

22 files changed: +285 -131 lines

doc/fluid/howto/optimization/timeline_cn.md

Lines changed: 13 additions & 7 deletions

````diff
@@ -1,21 +1,27 @@
 # How to use the timeline tool for performance analysis
 
-1. Add `with profiler.profiler(...)` around the main training loop. After running, the code generates a profile record file under `/tmp/profile`.
+1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` to the main training loop. After running, the code generates a profile record file under `/tmp/profile`.
 
 **Tip:**
 Please do not run too many iterations while recording timeline information, because the number of records in the timeline is proportional to the number of iterations.
 
 ```python
-with profiler.profiler('All', 'total', '/tmp/profile') as prof:
-    for pass_id in range(pass_num):
-        for batch_id, data in enumerate(train_reader()):
-            exe.run(fluid.default_main_program(),
-                    feed=feeder.feed(data),
-                    fetch_list=[])
+for pass_id in range(pass_num):
+    for batch_id, data in enumerate(train_reader()):
+        if pass_id == 0 and batch_id == 5:
+            profiler.start_profiler("All")
+        elif pass_id == 0 and batch_id == 10:
+            profiler.stop_profiler("total", "/tmp/profile")
+        exe.run(fluid.default_main_program(),
+                feed=feeder.feed(data),
+                fetch_list=[])
 ...
 ```
 
 1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`. By default it generates a `/tmp/timeline` file; you can change this path with a command-line argument, see [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py).
+```shell
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
 
 1. Open the chrome browser, visit <chrome://tracing/>, and use the `load` button to load the generated `timeline` file.
````

doc/fluid/howto/optimization/timeline_en.md

Lines changed: 14 additions & 8 deletions

````diff
@@ -1,22 +1,28 @@
 # how to use timeline tool to do profile
 
-1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
+1. Add `profiler.start_profiler(...)` and `profiler.stop_profiler(...)` to the main training loop. After running, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information, for the profile record will grow with the batch count.
 
 ```python
-with profiler.profiler('All', 'total', '/tmp/profile') as prof:
-    for pass_id in range(pass_num):
-        for batch_id, data in enumerate(train_reader()):
-            exe.run(fluid.default_main_program(),
-                    feed=feeder.feed(data),
-                    fetch_list=[],
-                    use_program_cache=True)
+for pass_id in range(pass_num):
+    for batch_id, data in enumerate(train_reader()):
+        if pass_id == 0 and batch_id == 5:
+            profiler.start_profiler("All")
+        elif pass_id == 0 and batch_id == 10:
+            profiler.stop_profiler("total", "/tmp/profile")
+        exe.run(fluid.default_main_program(),
+                feed=feeder.feed(data),
+                fetch_list=[])
 ...
 ```
 
 1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`, it will generate another
 file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at
 [timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
 
+```shell
+python Paddle/tools/timeline.py --profile_path=/tmp/profile --timeline_path=timeline
+```
+
 1. Open chrome and visit <chrome://tracing/>, use `load` button to load the generated `timeline` file.
 
 ![chrome tracing](./tracing.jpeg)
````

paddle/fluid/framework/details/all_reduce_op_handle.cc

Lines changed: 2 additions & 0 deletions

```diff
@@ -17,6 +17,7 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -45,6 +46,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
 #endif
 
 void AllReduceOpHandle::RunImpl() {
+  platform::RecordEvent r("all_reduce", nullptr);
   if (NoDummyInputSize() == 1) {
     return;  // No need to all reduce when GPU count = 1;
   } else {
```
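A note on the pattern used here and in the handles below: `platform::RecordEvent` is scoped, so constructing it at the top of `RunImpl()` attributes the whole method body, including every early return, to the named event. A minimal self-contained sketch of that RAII idea (this `ScopedRecordEvent` is an illustrative stand-in, not Paddle's implementation):

```cpp
#include <chrono>
#include <iostream>
#include <string>
#include <utility>

// Illustrative stand-in for platform::RecordEvent (not the Paddle class):
// construction marks the start of an event, destruction at scope exit marks
// the end, so the whole body of RunImpl() is attributed to "all_reduce".
class ScopedRecordEvent {
 public:
  explicit ScopedRecordEvent(std::string name)
      : name_(std::move(name)), start_(std::chrono::steady_clock::now()) {}
  ~ScopedRecordEvent() {
    auto us = std::chrono::duration_cast<std::chrono::microseconds>(
                  std::chrono::steady_clock::now() - start_)
                  .count();
    std::cout << name_ << " took " << us << " us\n";
  }

 private:
  std::string name_;
  std::chrono::steady_clock::time_point start_;
};

void RunImpl() {
  ScopedRecordEvent r("all_reduce");  // event covers every exit path below
  // ... the actual all-reduce work would go here ...
}

int main() { RunImpl(); }
```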

paddle/fluid/framework/details/reduce_op_handle.cc

Lines changed: 2 additions & 0 deletions

```diff
@@ -16,12 +16,14 @@
 #include "paddle/fluid/framework/details/container_cast.h"
 #include "paddle/fluid/framework/details/reduce_and_gather.h"
 #include "paddle/fluid/framework/details/variable_visitor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
 namespace details {
 
 void ReduceOpHandle::RunImpl() {
+  platform::RecordEvent r("reduce", nullptr);
   if (places_.size() == 1) return;
   // the input and output may have dummy var.
   auto in_var_handles = DynamicCast<VarHandle>(inputs_);
```

paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc

Lines changed: 2 additions & 0 deletions

```diff
@@ -17,6 +17,7 @@
 #include <string>
 #include <vector>
 #include "paddle/fluid/framework/executor.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -62,6 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
     eptr = std::current_exception();
   }
 
+  platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
   drop_scope_counter_ += 1;
   if (!fetch_tensors.empty() ||
       drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) {
```

paddle/fluid/framework/details/threaded_ssa_graph_executor.cc

Lines changed: 4 additions & 0 deletions

```diff
@@ -15,6 +15,7 @@
 #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
 
 #include "paddle/fluid/framework/details/ssa_graph_builder.h"
+#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace framework {
@@ -34,6 +35,8 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
 
 FeedFetchList ThreadedSSAGraphExecutor::Run(
     const std::vector<std::string> &fetch_tensors) {
+  std::unique_ptr<platform::RecordEvent> event(
+      new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
   std::unordered_map<OpHandleBase *, size_t> pending_ops;
   std::unordered_set<VarHandleBase *> pending_vars;
   BlockingQueue<VarHandleBase *> ready_vars;
@@ -84,6 +87,7 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
   // Clean run context
   run_op_futures_.clear();
   exception_holder_.Clear();
+  event.reset(nullptr);
 
   // Step 3. Execution
   while (!pending_vars.empty()) {
```
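Unlike the plain scoped events above, here the event is held in a `std::unique_ptr` so the executor can end the "prepare" span early with `event.reset(nullptr)` rather than at function exit. A sketch of the same early-release pattern, reusing the hypothetical `ScopedRecordEvent` from the earlier sketch:

```cpp
#include <memory>

// Reuses the illustrative ScopedRecordEvent defined in the sketch above.
void Run() {
  std::unique_ptr<ScopedRecordEvent> event(
      new ScopedRecordEvent("ThreadedSSAGraphExecutorPrepare"));
  // ... build pending_ops / pending_vars / ready_vars (the "prepare" phase) ...
  event.reset(nullptr);  // destructor runs here: "prepare" ends before Step 3
  // ... Step 3: the execution loop runs outside the recorded span ...
}
```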

paddle/fluid/operators/lookup_table_op.cc

Lines changed: 9 additions & 5 deletions

```diff
@@ -32,11 +32,16 @@ class LookupTableOp : public framework::OperatorWithKernel {
 
     auto table_dims = ctx->GetInputDim("W");
     auto ids_dims = ctx->GetInputDim("Ids");
+    int ids_rank = ids_dims.size();
 
-    PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
-    PADDLE_ENFORCE_EQ(ids_dims[1], 1);
+    PADDLE_ENFORCE_EQ(table_dims.size(), 2);
+    PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1,
+                      "The last dimension of the 'Ids' tensor must be 1.");
 
-    ctx->SetOutputDim("Out", {ids_dims[0], table_dims[1]});
+    auto output_dims =
+        framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1));
+    output_dims.push_back(table_dims[1]);
+    ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
 
     if (ctx->GetOutputsVarType("Out")[0] ==
         framework::proto::VarType::LOD_TENSOR) {
@@ -61,8 +66,7 @@ class LookupTableOpMaker : public framework::OpProtoAndCheckerMaker {
     AddInput("Ids",
              "An input with type int32 or int64 "
              "contains the ids to be looked up in W. "
-             "Ids must be a column vector with rank = 2. "
-             "The 2nd dimension size must be 1.");
+             "The last dimension size must be 1.");
     AddOutput("Out", "The lookup results, which have the same type as W.");
     AddAttr<bool>("is_sparse",
                   "(boolean, default false) "
```

paddle/fluid/operators/lookup_table_op.cu

Lines changed: 8 additions & 5 deletions

```diff
@@ -118,28 +118,31 @@ class LookupTableGradCUDAKernel : public framework::OpKernel<T> {
     auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
 
     auto *ids_data = ids->data<int64_t>();
-    auto ids_dim = ids->dims();
+    int64_t ids_num = ids->numel();
 
     auto stream = dev_ctx.stream();
     // copy GPU memory to CPU pinned memory
     framework::Vector<int64_t> new_rows;
-    new_rows.resize(ids_dim[0]);
+    new_rows.resize(ids_num);
     auto gpu_place = boost::get<platform::CUDAPlace>(context.GetPlace());
 
     // TODO(yuyang18): Strange code here.
     memory::Copy(platform::CPUPlace(),
                  new_rows.CUDAMutableData(context.GetPlace()), gpu_place,
-                 ids_data, ids_dim[0] * sizeof(int64_t), stream);
+                 ids_data, ids_num * sizeof(int64_t), stream);
 
     d_table->set_rows(new_rows);
 
     auto *d_table_value = d_table->mutable_value();
-    d_table_value->Resize({ids_dim[0], table->dims()[1]});
+    d_table_value->Resize({ids_num, table->dims()[1]});
     d_table_value->mutable_data<T>(context.GetPlace());
 
     auto *d_table_data = d_table_value->data<T>();
     auto *d_output_data = d_output->data<T>();
-    PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
+    auto d_output_dims = d_output->dims();
+    PADDLE_ENFORCE_EQ(
+        d_table_value->dims(),
+        framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
     memory::Copy(gpu_place, d_table_data, gpu_place, d_output_data,
                  d_output->numel() * sizeof(T), stream);
 
```

paddle/fluid/operators/lookup_table_op.h

Lines changed: 9 additions & 7 deletions

```diff
@@ -109,36 +109,38 @@ class LookupTableGradKernel : public framework::OpKernel<T> {
       auto *d_table = context.Output<SelectedRows>(framework::GradVarName("W"));
 
       auto *ids_data = ids->data<int64_t>();
-      auto ids_dim = ids->dims();
+      int64_t ids_num = ids->numel();
 
       framework::Vector<int64_t> new_rows;
-      new_rows.reserve(ids_dim[0]);
-      for (int64_t i = 0; i < ids_dim[0]; i++) {
+      new_rows.reserve(ids_num);
+      for (int64_t i = 0; i < ids_num; i++) {
         new_rows.push_back(ids_data[i]);
       }
       d_table->set_rows(new_rows);
 
       auto *d_table_value = d_table->mutable_value();
-      d_table_value->Resize({ids_dim[0], table_dim[1]});
+      d_table_value->Resize({ids_num, table_dim[1]});
       d_table_value->mutable_data<T>(context.GetPlace());
 
       d_table->set_height(table_dim[0]);
 
       auto *d_output_data = d_output->data<T>();
       auto *d_table_data = d_table_value->data<T>();
 
-      PADDLE_ENFORCE_EQ(d_table_value->dims(), d_output->dims());
+      auto d_output_dims = d_output->dims();
+      PADDLE_ENFORCE_EQ(
+          d_table_value->dims(),
+          framework::flatten_to_2d(d_output_dims, d_output_dims.size() - 1));
       memcpy(d_table_data, d_output_data, sizeof(T) * d_output->numel());
     } else {
       auto *ids = context.Input<LoDTensor>("Ids");
       auto *d_output = context.Input<LoDTensor>(framework::GradVarName("Out"));
       auto *d_table = context.Output<LoDTensor>(framework::GradVarName("W"));
 
       auto *ids_data = ids->data<int64_t>();
-      auto ids_dim = ids->dims();
 
       int N = table_dim[0];
-      int D = d_output->dims()[1];
+      int D = table_dim[1];
 
       auto *d_output_data = d_output->data<T>();
       auto *d_table_data = d_table->mutable_data<T>(context.GetPlace());
```
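In both grad kernels the old check `d_table_value->dims() == d_output->dims()` only held for rank-2 outputs, so the kernels now compare against the gradient output flattened to 2-D. An illustrative analogue of what that flattening computes (plain C++, not Paddle's `framework::flatten_to_2d`):

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

// Illustrative analogue of framework::flatten_to_2d(dims, k): fold the first
// k dimensions into the row count and the remaining ones into the column count.
std::vector<int64_t> FlattenTo2D(const std::vector<int64_t>& dims, size_t k) {
  int64_t rows = std::accumulate(dims.begin(), dims.begin() + k, int64_t{1},
                                 std::multiplies<int64_t>());
  int64_t cols = std::accumulate(dims.begin() + k, dims.end(), int64_t{1},
                                 std::multiplies<int64_t>());
  return {rows, cols};
}

int main() {
  // d_output of shape [4, 16, 128] (from Ids [4, 16, 1] and emb width 128):
  // folding all but the last dimension gives {64, 128}, which matches the
  // gradient table's dims {ids_num = 4 * 16 = 64, D = 128}.
  for (int64_t d : FlattenTo2D({4, 16, 128}, 2)) std::cout << d << ' ';
}
```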

paddle/fluid/operators/parallel_do_op.cc

Lines changed: 0 additions & 5 deletions

```diff
@@ -18,7 +18,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/threadpool.h"
 #include "paddle/fluid/operators/detail/safe_ref.h"
-#include "paddle/fluid/platform/profiler.h"
 
 namespace paddle {
 namespace operators {
@@ -166,8 +165,6 @@ class ParallelDoOp : public framework::OperatorBase {
 
       workers.emplace_back(
           framework::Async([program, cur_scope, place, block, place_idx] {
-            // Give the thread an id to distinguish parallel block with same id.
-            platform::RecordThread rt(static_cast<int>(place_idx) + 1);
            framework::Executor executor(place);
            executor.Run(*program, cur_scope, block->ID(),
                         false /*create_local_scope*/);
@@ -244,8 +241,6 @@ class ParallelDoGradOp : public framework::OperatorBase {
      // execute
      workers.emplace_back(
          framework::Async([program, cur_scope, place, block, i] {
-            // Give the thread an id to distinguish parallel block with same id.
-            platform::RecordThread rt(static_cast<int>(i) + 1);
            framework::Executor executor(place);
            executor.Run(*program, cur_scope, block->ID(),
                         false /*create_local_scope*/);
```
